languagemodels

  1import requests
  2import datetime
  3import json
  4import re
  5from typing import overload
  6
  7from languagemodels.config import config
  8from languagemodels.preprocess import get_html_paragraphs
  9from languagemodels.inference import (
 10    generate,
 11    rank_instruct,
 12    parse_chat,
 13    list_tokens,
 14)
 15from languagemodels import embeddings
 16
 17docs = embeddings.RetrievalContext()
 18
 19
 20def complete(prompt: str) -> str:
 21    """Provide one completion for a given open-ended prompt
 22
 23    :param prompt: Prompt to use as input to the model
 24    :return: Completion returned from the language model
 25
 26    Examples:
 27
 28    >>> complete("Luke thought that he") #doctest: +SKIP
 29    'was going to be a doctor.'
 30
 31    >>> complete("There are many mythical creatures who") #doctest: +SKIP
 32    'are able to fly'
 33
 34    >>> complete("She hid in her room until") #doctest: +SKIP
 35    'she was sure she was safe'
 36    """
 37
 38    result = generate(
 39        ["Write a sentence"],
 40        prefix=prompt,
 41        max_tokens=config["max_tokens"],
 42        temperature=0.7,
 43        topk=40,
 44    )[0]
 45
 46    if result.startswith(prompt):
 47        prefix_length = len(prompt)
 48        return result[prefix_length:]
 49    else:
 50        return result
 51
 52
@overload
def do(prompt: list) -> list:
    # typing.overload stub: a list of prompts yields a list of results
    ...
 56
 57
@overload
def do(prompt: str) -> str:
    # typing.overload stub: a single prompt string yields a single string
    ...
 61
 62
 63def do(prompt, choices=None):
 64    """Follow a single-turn instructional prompt
 65
 66    :param prompt: Instructional prompt(s) to follow
 67    :param choices: If provided, outputs are restricted to values in choices
 68    :return: Completion returned from the language model
 69
 70    Note that this function is overloaded to return a list of results if
 71    a list if of prompts is provided and a single string if a single
 72    prompt is provided as a string
 73
 74    Examples:
 75
 76    >>> do("Translate Spanish to English: Hola mundo!") #doctest: +SKIP
 77    'Hello world!'
 78
 79    >>> do("Pick the planet from the list: baseball, Texas, Saturn")
 80    '...Saturn...'
 81
 82    >>> do(["Pick the planet from the list: baseball, Texas, Saturn"] * 2)
 83    ['...Saturn...', '...Saturn...']
 84
 85    >>> do(["Say red", "Say blue"], choices=["red", "blue"])
 86    ['red', 'blue']
 87
 88    >>> do("Classify as positive or negative: LLMs are bad",
 89    ... choices=["Positive", "Negative"])
 90    'Negative'
 91
 92    >>> do("Classify as positive or negative: LLMs are great",
 93    ... choices=["Positive", "Negative"])
 94    'Positive'
 95    """
 96
 97    prompts = [prompt] if isinstance(prompt, str) else prompt
 98
 99    if choices:
100        results = [r[0] for r in rank_instruct(prompts, choices)]
101    else:
102        results = generate(prompts, max_tokens=config["max_tokens"], topk=1)
103
104    return results[0] if isinstance(prompt, str) else results
105
106
@overload
def embed(doc: list) -> list:
    # typing.overload stub: a list of docs yields a list of embeddings
    ...
110
111
@overload
def embed(doc: str) -> list:
    # typing.overload stub: a single doc yields one embedding, which the
    # implementation returns as a list of floats — not a string. The
    # previous "-> str" annotation was incorrect.
    ...
115
116
def embed(doc):
    """Create embedding for a document

    :param doc: Document(s) to embed
    :return: Embedding

    Note that this function is overloaded to return a list of embeddings if
    a list of docs is provided and a single embedding if a single
    doc is provided as a string

    Examples:

    >>> embed("Hello, world")
    [-0.0...]

    >>> embed(["Hello", "world"])
    [[-0.0...]]
    """

    single = isinstance(doc, str)
    batch = [doc] if single else doc

    # Convert each embedding vector into a plain list of Python floats;
    # a local name other than `docs` avoids shadowing the module-level
    # retrieval context
    vectors = []
    for vector in embeddings.embed(batch):
        vectors.append([float(component) for component in vector])

    return vectors[0] if single else vectors
142
143
def chat(prompt: str) -> str:
    """Get new message from chat-optimized language model

    The `prompt` for this model is provided as a series of messages as a single
    plain-text string. Several special tokens are used to delineate chat
    messages.

    - `system:` - Indicates the start of a system message providing
    instructions about how the assistant should behave.
    - `user:` - Indicates the start of a prompter (typically user)
    message.
    - `assistant:` - Indicates the start of an assistant message.

    A complete prompt may look something like this:

    ```
    Assistant is helpful and harmless

    User: What is the capital of Germany?

    Assistant: The capital of Germany is Berlin.

    User: How many people live there?

    Assistant:
    ```

    The completion from the language model is returned.

    :param prompt: Prompt using formatting described above
    :return: Completion returned from the language model

    Examples:

    >>> response = chat('''
    ...      System: Respond as a helpful assistant. It is 5:00pm.
    ...
    ...      User: What time is it?
    ...
    ...      Assistant:
    ...      ''') # doctest: +SKIP
    "It's 5:00pm."
    """

    messages = parse_chat(prompt)

    # Suppress "Assistant: <first word>" for prior assistant AND user
    # messages so the model does not regenerate an earlier turn verbatim
    suppress = [
        "Assistant: " + m["content"].split(" ")[0]
        for m in messages
        if m["role"] in ["assistant", "user"]
    ]

    # Suppress all user messages to avoid repeating them
    suppress += [m["content"] for m in messages if m["role"] == "user"]

    system_msgs = [m for m in messages if m["role"] == "system"]
    assistant_msgs = [m for m in messages if m["role"] == "assistant"]
    user_msgs = [m for m in messages if m["role"] == "user"]

    # The current model is tuned on instructions and tends to get
    # lost if it sees too many questions
    # Use only the most recent user and assistant message for context
    # Keep all system messages
    messages = system_msgs + assistant_msgs[-1:] + user_msgs[-1:]

    # Map internal roles onto the labels used in the rendered prompt;
    # note that user turns are rendered as "Question", not "User"
    rolemap = {
        "system": "System",
        "user": "Question",
        "assistant": "Assistant",
    }

    messages = [f"{rolemap[m['role']]}: {m['content']}" for m in messages]

    prompt = "\n\n".join(messages) + "\n\n" + "Assistant:"

    # Drop a leading "System:" label (7 characters) so system instructions
    # read as plain context at the top of the prompt
    if prompt.startswith("System:"):
        prompt = prompt[7:].strip()

    response = generate(
        [prompt],
        max_tokens=config["max_tokens"],
        temperature=0.3,
        topk=40,
        prefix="Assistant:",
        suppress=suppress,
    )[0]

    # Remove duplicate "Assistant:" label (10 characters) if the model
    # echoed it at the start of the generation
    if response.startswith("Assistant:"):
        response = response[10:]

    return response.strip()
237
238
def code(prompt: str) -> str:
    """Complete a code prompt

    This assumes that users are expecting Python completions. Default models
    are fine-tuned on Python where applicable.

    :param prompt: Code context to complete
    :return: Completion returned from the language model

    Examples:

    >>> code("# Print Hello, world!\\n")
    'print("Hello, world!")\\n'

    >>> code("def return_4():")
    '...return 4...'
    """
    # Greedy decoding (topk=1) against the code-specialized model
    results = generate(
        [prompt],
        max_tokens=config["max_tokens"],
        topk=1,
        model="code",
    )
    return results[0]
257
258
def extract_answer(question: str, context: str) -> str:
    """Extract an answer to a `question` from a provided `context`

    :param question: A question to answer using knowledge from context
    :param context: Knowledge used to answer the question
    :return: Answer to the question.

    Examples:

    >>> context = "There is a green ball and a red box"
    >>> extract_answer("What color is the ball?", context).lower()
    '...green...'

    >>> extract_answer("Who created Python?", get_wiki('Python')) #doctest: +SKIP
    '...Guido van Rossum...'
    """

    # Present the context first, then the question, as a single prompt
    combined = f"{context}\n\n{question}"
    return generate([combined])[0]
277
278
def classify(doc: str, label1: str, label2: str) -> str:
    """Performs binary classification on an input

    :param doc: A plain text input document to classify
    :param label1: The first label to classify against
    :param label2: The second label to classify against
    :return: The closest matching class. The return value will always be
    `label1` or `label2`

    Examples:

    >>> classify("That book was good.","positive","negative")
    'positive'
    >>> classify("That movie was terrible.","positive","negative")
    'negative'
    """

    # Delegate to `do` with constrained choices so the result is always
    # exactly one of the two labels
    instruction = f"Classify as {label1} or {label2}: {doc}\n\nClassification:"
    return do(instruction, choices=[label1, label2])
300
301
def store_doc(doc: str, name: str = "") -> None:
    """Store document for later retrieval

    :param doc: A plain text document to store.
    :param name: Optional name for the document. This is used as a chunk prefix.

    Examples:

    >>> store_doc("The sky is blue.")
    """
    # Chunking and embedding are handled by the shared retrieval context
    docs.store(doc, name)
313
314
def load_doc(query: str) -> str:
    """Load a matching document

    A single document that best matches `query` will be returned.

    :param query: Query to compare to stored documents
    :return: Content of the closest matching document

    Examples:

    >>> store_doc("Paris is in France.")
    >>> store_doc("The sky is blue.")
    >>> load_doc("Where is Paris?")
    'Paris is in France.'
    """
    # Similarity search against the shared retrieval context
    return docs.get_match(query)
331
332
def get_doc_context(query: str) -> str:
    """Loads context from documents

    A string representing the most relevant content from all stored documents
    will be returned. This may be a blend of chunks from multiple documents.

    :param query: Query to compare to stored documents
    :return: Up to 128 tokens of context

    Examples:

    >>> store_doc("Paris is in France.")
    >>> store_doc("Paris is nice.")
    >>> store_doc("The sky is blue.")
    >>> get_doc_context("Where is Paris?")
    'Paris is in France.\\n\\nParis is nice.'
    """
    # The retrieval context assembles the best-matching chunks for us
    return docs.get_context(query)
351
352
def get_web(url: str) -> str:
    """
    Return the text of paragraphs from a web page

    :param url: The URL to load
    :return: Plain text content from the URL

    Note that it is difficult to return only the human-readable
    content from an HTML page. This function takes a basic and quick
    approach. It will not work perfectly on all sites, but will
    often do a reasonable job of returning the plain text content
    of a page.

    If the `url` points to a plain text page, the page content
    will be returned verbatim.
    """

    res = requests.get(
        url, headers={"User-Agent": "Mozilla/5.0 (compatible; languagemodels)"}
    )

    # Use requests' case-insensitive parsed headers rather than
    # res.raw.getheader(): the latter returns None when the header is
    # absent, which made the previous `in` membership test raise TypeError.
    content_type = res.headers.get("content-type", "")

    if "text/plain" in content_type:
        return res.text
    elif "text/html" in content_type:
        return get_html_paragraphs(res.text)

    # Unsupported content type (images, PDFs, etc.)
    return ""
380
381
def get_wiki(topic: str) -> str:
    """
    Return Wikipedia summary for a topic

    This function ignores the complexity of disambiguation pages and simply
    returns the first result that is not a disambiguation page

    :param topic: Topic to search for on Wikipedia
    :return: Text content of the lead section of the most popular matching article

    Examples:

    >>> get_wiki('Python language')
    'Python is a high-level...'

    >>> get_wiki('Chemistry')
    'Chemistry is the scientific study...'
    """

    url = "https://api.wikimedia.org/core/v1/wikipedia/en/search/title"
    response = requests.get(url, params={"q": topic, "limit": 5})
    response = json.loads(response.text)

    for page in response["pages"]:
        wiki_result = requests.get(
            f"https://en.wikipedia.org/w/api.php?action=query&prop=extracts|pageprops&"
            f"exintro&redirects=1&titles={page['title']}&format=json"
        ).json()

        first = wiki_result["query"]["pages"].popitem()[1]

        # `pageprops` is omitted entirely for pages with no page properties,
        # so use .get() to avoid a KeyError and only skip pages confirmed to
        # be disambiguation pages
        if "disambiguation" in first.get("pageprops", {}):
            continue

        summary = first["extract"]

        # Truncate trailing boilerplate sections from the article extract
        cutoffs = [
            "See_also",
            "Notes",
            "References",
            "Further_reading",
            "External_links",
        ]

        for cutoff in cutoffs:
            summary = summary.split(f'<span id="{cutoff}">', 1)[0]

        # Convert paragraph tags to blank lines, then strip HTML comments,
        # remaining tags, and collapse runs of blank lines
        summary = re.sub(r"<p>", "\n\n", summary, flags=re.I)
        summary = re.sub(r"<!--.*?-->", "", summary, flags=re.I | re.DOTALL)
        summary = re.sub(r"<.*?>", "", summary, flags=re.I)
        summary = re.sub(r"\s*[\n\r]+\s*[\r\n]+[\s\r\n]*", "\n\n", summary, flags=re.I)
        summary = summary.strip()
        return summary
    else:
        # Loop exhausted without returning: every result was a disambiguation
        return "No matching wiki page found."
436
437
def get_weather(latitude, longitude):
    """Fetch the current weather for a supplied longitude and latitude

    Weather is provided by the US government and this function only supports
    locations in the United States.

    :param latitude: Latitude value representing this location
    :param longitude: Longitude value representing this location
    :return: Plain text description of the current weather forecast

    Examples:

    >>> get_weather(41.8, -87.6) # doctest: +SKIP
    'Scattered showers and thunderstorms before 1pm with a high of 73.'
    """

    # First resolve the coordinates to a gridpoint forecast URL
    points_response = requests.get(
        f"https://api.weather.gov/points/{latitude},{longitude}"
    )
    points = json.loads(points_response.text)

    # Then fetch the forecast itself; the first period is the current one
    forecast_response = requests.get(points["properties"]["forecast"])
    forecast = json.loads(forecast_response.text)

    return forecast["properties"]["periods"][0]["detailedForecast"]
463
464
def get_date() -> str:
    """Returns the current date and time in natural language

    >>> get_date() # doctest: +SKIP
    'Friday, May 12, 2023 at 09:27AM'
    """

    # Example rendering: 'Friday, May 12, 2023 at 09:27AM'
    return datetime.datetime.now().strftime("%A, %B %d, %Y at %I:%M%p")
475
476
def print_tokens(prompt: str) -> None:
    """Prints a list of tokens in a prompt

    :param prompt: Prompt to use as input to tokenizer
    :return: Nothing

    Examples:

    >>> print_tokens("Hello world")
    ' Hello' (token 8774)
    ' world' (token 296)

    >>> print_tokens("Hola mundo")
    ' Hol' (token 5838)
    'a' (token 9)
    ' mun' (token 13844)
    'd' (token 26)
    'o' (token 32)
    """

    # Each token is a (text, id) pair; the '▁' marker used by the
    # tokenizer is rendered as a plain space for readability
    for text, token_id in list_tokens(prompt):
        display = text.replace("▁", " ")
        print(f"'{display}' (token {token_id})")
501
502
def count_tokens(prompt: str) -> int:
    """Counts tokens in a prompt

    :param prompt: Prompt to use as input to tokenizer
    :return: Number of tokens in the prompt

    Examples:

    >>> count_tokens("Hello world")
    2

    >>> count_tokens("Hola mundo")
    5
    """

    # list_tokens returns one entry per token, so its length is the count.
    # The previous "-> None" annotation and ":return: Nothing" doc were
    # incorrect: this function returns an int.
    return len(list_tokens(prompt))
519
520
def set_max_ram(value):
    """Sets max allowed RAM

    This value takes priority over environment variables

    Returns the numeric value set in GB

    >>> set_max_ram(16)
    16.0

    >>> set_max_ram('512mb')
    0.5
    """

    # The config object normalizes the value (e.g. '512mb' -> 0.5), so the
    # stored value is read back rather than echoing the raw input
    config["max_ram"] = value
    return config["max_ram"]
538
539
def require_model_license(match_re):
    """Require models to match supplied regex

    This can be used to enforce certain licensing constraints when using this
    package.

    :param match_re: Regular expression that model licenses must match
    """
    # Stored in config; model selection elsewhere consults this value
    config["model_license"] = match_re
def complete(prompt: str) -> str:
21def complete(prompt: str) -> str:
22    """Provide one completion for a given open-ended prompt
23
24    :param prompt: Prompt to use as input to the model
25    :return: Completion returned from the language model
26
27    Examples:
28
29    >>> complete("Luke thought that he") #doctest: +SKIP
30    'was going to be a doctor.'
31
32    >>> complete("There are many mythical creatures who") #doctest: +SKIP
33    'are able to fly'
34
35    >>> complete("She hid in her room until") #doctest: +SKIP
36    'she was sure she was safe'
37    """
38
39    result = generate(
40        ["Write a sentence"],
41        prefix=prompt,
42        max_tokens=config["max_tokens"],
43        temperature=0.7,
44        topk=40,
45    )[0]
46
47    if result.startswith(prompt):
48        prefix_length = len(prompt)
49        return result[prefix_length:]
50    else:
51        return result

Provide one completion for a given open-ended prompt

Parameters
  • prompt: Prompt to use as input to the model
Returns

Completion returned from the language model

Examples:

>>> complete("Luke thought that he") #doctest: +SKIP
'was going to be a doctor.'
>>> complete("There are many mythical creatures who") #doctest: +SKIP
'are able to fly'
>>> complete("She hid in her room until") #doctest: +SKIP
'she was sure she was safe'
def do(prompt, choices=None):
 64def do(prompt, choices=None):
 65    """Follow a single-turn instructional prompt
 66
 67    :param prompt: Instructional prompt(s) to follow
 68    :param choices: If provided, outputs are restricted to values in choices
 69    :return: Completion returned from the language model
 70
 71    Note that this function is overloaded to return a list of results if
 72    a list if of prompts is provided and a single string if a single
 73    prompt is provided as a string
 74
 75    Examples:
 76
 77    >>> do("Translate Spanish to English: Hola mundo!") #doctest: +SKIP
 78    'Hello world!'
 79
 80    >>> do("Pick the planet from the list: baseball, Texas, Saturn")
 81    '...Saturn...'
 82
 83    >>> do(["Pick the planet from the list: baseball, Texas, Saturn"] * 2)
 84    ['...Saturn...', '...Saturn...']
 85
 86    >>> do(["Say red", "Say blue"], choices=["red", "blue"])
 87    ['red', 'blue']
 88
 89    >>> do("Classify as positive or negative: LLMs are bad",
 90    ... choices=["Positive", "Negative"])
 91    'Negative'
 92
 93    >>> do("Classify as positive or negative: LLMs are great",
 94    ... choices=["Positive", "Negative"])
 95    'Positive'
 96    """
 97
 98    prompts = [prompt] if isinstance(prompt, str) else prompt
 99
100    if choices:
101        results = [r[0] for r in rank_instruct(prompts, choices)]
102    else:
103        results = generate(prompts, max_tokens=config["max_tokens"], topk=1)
104
105    return results[0] if isinstance(prompt, str) else results

Follow a single-turn instructional prompt

Parameters
  • prompt: Instructional prompt(s) to follow
  • choices: If provided, outputs are restricted to values in choices
Returns

Completion returned from the language model

Note that this function is overloaded to return a list of results if a list of prompts is provided and a single string if a single prompt is provided as a string

Examples:

>>> do("Translate Spanish to English: Hola mundo!") #doctest: +SKIP
'Hello world!'
>>> do("Pick the planet from the list: baseball, Texas, Saturn")
'...Saturn...'
>>> do(["Pick the planet from the list: baseball, Texas, Saturn"] * 2)
['...Saturn...', '...Saturn...']
>>> do(["Say red", "Say blue"], choices=["red", "blue"])
['red', 'blue']
>>> do("Classify as positive or negative: LLMs are bad",
... choices=["Positive", "Negative"])
'Negative'
>>> do("Classify as positive or negative: LLMs are great",
... choices=["Positive", "Negative"])
'Positive'
def embed(doc):
118def embed(doc):
119    """Create embedding for a document
120
121    :param doc: Document(s) to embed
122    :return: Embedding
123
124    Note that this function is overloaded to return a list of embeddings if
125    a list if of docs is provided and a single embedding if a single
126    doc is provided as a string
127
128    Examples:
129
130    >>> embed("Hello, world")
131    [-0.0...]
132
133    >>> embed(["Hello", "world"])
134    [[-0.0...]]
135    """
136
137    docs = [doc] if isinstance(doc, str) else doc
138
139    # Create embeddings and convert to lists of floats
140    emb = [[float(n) for n in e] for e in embeddings.embed(docs)]
141
142    return emb[0] if isinstance(doc, str) else emb

Create embedding for a document

Parameters
  • doc: Document(s) to embed
Returns

Embedding

Note that this function is overloaded to return a list of embeddings if a list of docs is provided and a single embedding if a single doc is provided as a string

Examples:

>>> embed("Hello, world")
[-0.0...]
>>> embed(["Hello", "world"])
[[-0.0...]]
def chat(prompt: str) -> str:
145def chat(prompt: str) -> str:
146    """Get new message from chat-optimized language model
147
148    The `prompt` for this model is provided as a series of messages as a single
149    plain-text string. Several special tokens are used to delineate chat
150    messages.
151
152    - `system:` - Indicates the start of a system message providing
153    instructions about how the assistant should behave.
154    - `user:` - Indicates the start of a prompter (typically user)
155    message.
156    - `assistant:` - Indicates the start of an assistant message.
157
158    A complete prompt may look something like this:
159
160    ```
161    Assistant is helpful and harmless
162
163    User: What is the capital of Germany?
164
165    Assistant: The capital of Germany is Berlin.
166
167    User: How many people live there?
168
169    Assistant:
170    ```
171
172    The completion from the language model is returned.
173
174    :param message: Prompt using formatting described above
175    :return: Completion returned from the language model
176
177    Examples:
178
179    >>> response = chat('''
180    ...      System: Respond as a helpful assistant. It is 5:00pm.
181    ...
182    ...      User: What time is it?
183    ...
184    ...      Assistant:
185    ...      ''') # doctest: +SKIP
186    "It's 5:00pm."
187    """
188
189    messages = parse_chat(prompt)
190
191    # Suppress starts of all assistant messages to avoid repeat generation
192    suppress = [
193        "Assistant: " + m["content"].split(" ")[0]
194        for m in messages
195        if m["role"] in ["assistant", "user"]
196    ]
197
198    # Suppress all user messages to avoid repeating them
199    suppress += [m["content"] for m in messages if m["role"] == "user"]
200
201    system_msgs = [m for m in messages if m["role"] == "system"]
202    assistant_msgs = [m for m in messages if m["role"] == "assistant"]
203    user_msgs = [m for m in messages if m["role"] == "user"]
204
205    # The current model is tuned on instructions and tends to get
206    # lost if it sees too many questions
207    # Use only the most recent user and assistant message for context
208    # Keep all system messages
209    messages = system_msgs + assistant_msgs[-1:] + user_msgs[-1:]
210
211    rolemap = {
212        "system": "System",
213        "user": "Question",
214        "assistant": "Assistant",
215    }
216
217    messages = [f"{rolemap[m['role']]}: {m['content']}" for m in messages]
218
219    prompt = "\n\n".join(messages) + "\n\n" + "Assistant:"
220
221    if prompt.startswith("System:"):
222        prompt = prompt[7:].strip()
223
224    response = generate(
225        [prompt],
226        max_tokens=config["max_tokens"],
227        temperature=0.3,
228        topk=40,
229        prefix="Assistant:",
230        suppress=suppress,
231    )[0]
232
233    # Remove duplicate assistant being generated
234    if response.startswith("Assistant:"):
235        response = response[10:]
236
237    return response.strip()

Get new message from chat-optimized language model

The prompt for this model is provided as a series of messages as a single plain-text string. Several special tokens are used to delineate chat messages.

  • system: - Indicates the start of a system message providing instructions about how the assistant should behave.
  • user: - Indicates the start of a prompter (typically user) message.
  • assistant: - Indicates the start of an assistant message.

A complete prompt may look something like this:

Assistant is helpful and harmless

User: What is the capital of Germany?

Assistant: The capital of Germany is Berlin.

User: How many people live there?

Assistant:

The completion from the language model is returned.

Parameters
  • message: Prompt using formatting described above
Returns

Completion returned from the language model

Examples:

>>> response = chat('''
...      System: Respond as a helpful assistant. It is 5:00pm.
...
...      User: What time is it?
...
...      Assistant:
...      ''') # doctest: +SKIP
"It's 5:00pm."
def code(prompt: str) -> str:
240def code(prompt: str) -> str:
241    """Complete a code prompt
242
243    This assumes that users are expecting Python completions. Default models
244    are fine-tuned on Python where applicable.
245
246    :param prompt: Code context to complete
247    :return: Completion returned from the language model
248
249    Examples:
250
251    >>> code("# Print Hello, world!\\n")
252    'print("Hello, world!")\\n'
253
254    >>> code("def return_4():")
255    '...return 4...'
256    """
257    return generate([prompt], max_tokens=config["max_tokens"], topk=1, model="code")[0]

Complete a code prompt

This assumes that users are expecting Python completions. Default models are fine-tuned on Python where applicable.

Parameters
  • prompt: Code context to complete
Returns

Completion returned from the language model

Examples:

>>> code("# Print Hello, world!\n")
'print("Hello, world!")\n'
>>> code("def return_4():")
'...return 4...'
def extract_answer(question: str, context: str) -> str:
260def extract_answer(question: str, context: str) -> str:
261    """Extract an answer to a `question` from a provided `context`
262
263    :param question: A question to answer using knowledge from context
264    :param context: Knowledge used to answer the question
265    :return: Answer to the question.
266
267    Examples:
268
269    >>> context = "There is a green ball and a red box"
270    >>> extract_answer("What color is the ball?", context).lower()
271    '...green...'
272
273    >>> extract_answer("Who created Python?", get_wiki('Python')) #doctest: +SKIP
274    '...Guido van Rossum...'
275    """
276
277    return generate([f"{context}\n\n{question}"])[0]

Extract an answer to a question from a provided context

Parameters
  • question: A question to answer using knowledge from context
  • context: Knowledge used to answer the question
Returns

Answer to the question.

Examples:

>>> context = "There is a green ball and a red box"
>>> extract_answer("What color is the ball?", context).lower()
'...green...'
>>> extract_answer("Who created Python?", get_wiki('Python')) #doctest: +SKIP
'...Guido van Rossum...'
def classify(doc: str, label1: str, label2: str) -> str:
280def classify(doc: str, label1: str, label2: str) -> str:
281    """Performs binary classification on an input
282
283    :param doc: A plain text input document to classify
284    :param label1: The first label to classify against
285    :param label2: The second label to classify against
286    :return: The closest matching class. The return value will always be
287    `label1` or `label2`
288
289    Examples:
290
291    >>> classify("That book was good.","positive","negative")
292    'positive'
293    >>> classify("That movie was terrible.","positive","negative")
294    'negative'
295    """
296
297    return do(
298        f"Classify as {label1} or {label2}: {doc}\n\nClassification:",
299        choices=[label1, label2],
300    )

Performs binary classification on an input

Parameters
  • doc: A plain text input document to classify
  • label1: The first label to classify against
  • label2: The second label to classify against
Returns

The closest matching class. The return value will always be label1 or label2

Examples:

>>> classify("That book was good.","positive","negative")
'positive'
>>> classify("That movie was terrible.","positive","negative")
'negative'
def store_doc(doc: str, name: str = '') -> None:
303def store_doc(doc: str, name: str = "") -> None:
304    """Store document for later retrieval
305
306    :param doc: A plain text document to store.
307    :param name: Optional name for the document. This is used as a chunk prefix.
308
309    Examples:
310
311    >>> store_doc("The sky is blue.")
312    """
313    docs.store(doc, name)

Store document for later retrieval

Parameters
  • doc: A plain text document to store.
  • name: Optional name for the document. This is used as a chunk prefix.

Examples:

>>> store_doc("The sky is blue.")
def load_doc(query: str) -> str:
316def load_doc(query: str) -> str:
317    """Load a matching document
318
319    A single document that best matches `query` will be returned.
320
321    :param query: Query to compare to stored documents
322    :return: Content of the closest matching document
323
324    Examples:
325
326    >>> store_doc("Paris is in France.")
327    >>> store_doc("The sky is blue.")
328    >>> load_doc("Where is Paris?")
329    'Paris is in France.'
330    """
331    return docs.get_match(query)

Load a matching document

A single document that best matches query will be returned.

Parameters
  • query: Query to compare to stored documents
Returns

Content of the closest matching document

Examples:

>>> store_doc("Paris is in France.")
>>> store_doc("The sky is blue.")
>>> load_doc("Where is Paris?")
'Paris is in France.'
def get_doc_context(query: str) -> str:
334def get_doc_context(query: str) -> str:
335    """Loads context from documents
336
337    A string representing the most relevant content from all stored documents
338    will be returned. This may be a blend of chunks from multiple documents.
339
340    :param query: Query to compare to stored documents
341    :return: Up to 128 tokens of context
342
343    Examples:
344
345    >>> store_doc("Paris is in France.")
346    >>> store_doc("Paris is nice.")
347    >>> store_doc("The sky is blue.")
348    >>> get_doc_context("Where is Paris?")
349    'Paris is in France.\\n\\nParis is nice.'
350    """
351    return docs.get_context(query)

Loads context from documents

A string representing the most relevant content from all stored documents will be returned. This may be a blend of chunks from multiple documents.

Parameters
  • query: Query to compare to stored documents
Returns

Up to 128 tokens of context

Examples:

>>> store_doc("Paris is in France.")
>>> store_doc("Paris is nice.")
>>> store_doc("The sky is blue.")
>>> get_doc_context("Where is Paris?")
'Paris is in France.\n\nParis is nice.'
def get_web(url: str) -> str:
354def get_web(url: str) -> str:
355    """
356    Return the text of paragraphs from a web page
357
358    :param url: The URL to load
359    :return str: Plain text content from the URL
360
361    Note that it is difficult to return only the human-readable
362    content from an HTML page. This function takes a basic and quick
363    approach. It will not work perfectly on all sites, but will
364    often do a reasonable job of returning the plain text content
365    of a page.
366
367    If the `url` points to a plain text page, the page content
368    will be returned verbatim.
369    """
370
371    res = requests.get(
372        url, headers={"User-Agent": "Mozilla/5.0 (compatible; languagemodels)"}
373    )
374
375    if "text/plain" in res.raw.getheader("content-type"):
376        return res.text
377    elif "text/html" in res.raw.getheader("content-type"):
378        return get_html_paragraphs(res.text)
379
380    return ""

Return the text of paragraphs from a web page

Parameters
  • url: The URL to load
Returns

Plain text content from the URL

Note that it is difficult to return only the human-readable content from an HTML page. This function takes a basic and quick approach. It will not work perfectly on all sites, but will often do a reasonable job of returning the plain text content of a page.

If the url points to a plain text page, the page content will be returned verbatim.

def get_wiki(topic: str) -> str:
def get_wiki(topic: str) -> str:
    """
    Return Wikipedia summary for a topic

    This function ignores the complexity of disambiguation pages and simply
    returns the first result that is not a disambiguation page

    :param topic: Topic to search for on Wikipedia
    :return: Text content of the lead section of the most popular matching article

    Examples:

    >>> get_wiki('Python language')
    'Python is a high-level...'

    >>> get_wiki('Chemistry')
    'Chemistry is the scientific study...'
    """

    url = "https://api.wikimedia.org/core/v1/wikipedia/en/search/title"
    response = requests.get(url, params={"q": topic, "limit": 5})
    response = json.loads(response.text)

    for page in response["pages"]:
        # Pass the title via `params` so requests URL-encodes it; titles
        # containing spaces or '&' would break a raw f-string URL.
        wiki_result = requests.get(
            "https://en.wikipedia.org/w/api.php",
            params={
                "action": "query",
                "prop": "extracts|pageprops",
                "exintro": "1",
                "redirects": "1",
                "titles": page["title"],
                "format": "json",
            },
        ).json()

        first = wiki_result["query"]["pages"].popitem()[1]
        # Pages with no page properties omit the "pageprops" key entirely,
        # so default to an empty dict instead of raising KeyError.
        if "disambiguation" in first.get("pageprops", {}):
            continue

        summary = first["extract"]

        # Trailing boilerplate sections to strip from the extract
        cutoffs = [
            "See_also",
            "Notes",
            "References",
            "Further_reading",
            "External_links",
        ]

        for cutoff in cutoffs:
            summary = summary.split(f'<span id="{cutoff}">', 1)[0]

        # Convert paragraph breaks, strip HTML comments and tags, and
        # collapse runs of blank lines into a single paragraph break.
        summary = re.sub(r"<p>", "\n\n", summary, flags=re.I)
        summary = re.sub(r"<!--.*?-->", "", summary, flags=re.I | re.DOTALL)
        summary = re.sub(r"<.*?>", "", summary, flags=re.I)
        summary = re.sub(r"\s*[\n\r]+\s*[\r\n]+[\s\r\n]*", "\n\n", summary, flags=re.I)
        summary = summary.strip()
        return summary
    else:
        return "No matching wiki page found."

Return Wikipedia summary for a topic

This function ignores the complexity of disambiguation pages and simply returns the first result that is not a disambiguation page

Parameters
  • topic: Topic to search for on Wikipedia
Returns

Text content of the lead section of the most popular matching article

Examples:

>>> get_wiki('Python language')
'Python is a high-level...'
>>> get_wiki('Chemistry')
'Chemistry is the scientific study...'
def get_weather(latitude, longitude):
def get_weather(latitude, longitude):
    """Fetch the current weather for a supplied longitude and latitude

    Weather is provided by the US government and this function only supports
    locations in the United States.

    :param latitude: Latitude value representing this location
    :param longitude: Longitude value representing this location
    :return: Plain text description of the current weather forecast

    Examples:

    >>> get_weather(41.8, -87.6) # doctest: +SKIP
    'Scattered showers and thunderstorms before 1pm with a high of 73.'
    """

    # Resolve the lat/lon point to its NWS forecast endpoint
    points_res = requests.get(
        f"https://api.weather.gov/points/{latitude},{longitude}"
    )
    forecast_url = json.loads(points_res.text)["properties"]["forecast"]

    # The first period in the forecast list is the current one
    forecast_res = requests.get(forecast_url)
    periods = json.loads(forecast_res.text)["properties"]["periods"]

    return periods[0]["detailedForecast"]

Fetch the current weather for a supplied longitude and latitude

Weather is provided by the US government and this function only supports locations in the United States.

Parameters
  • latitude: Latitude value representing this location
  • longitude: Longitude value representing this location
Returns

Plain text description of the current weather forecast

Examples:

>>> get_weather(41.8, -87.6) # doctest: +SKIP
'Scattered showers and thunderstorms before 1pm with a high of 73.'
def get_date() -> str:
def get_date() -> str:
    """Returns the current date and time in natural language

    >>> get_date() # doctest: +SKIP
    'Friday, May 12, 2023 at 09:27AM'
    """

    # Format local time as e.g. "Friday, May 12, 2023 at 09:27AM"
    return datetime.datetime.now().strftime("%A, %B %d, %Y at %I:%M%p")

Returns the current date and time in natural language

>>> get_date() # doctest: +SKIP
'Friday, May 12, 2023 at 09:27AM'
def count_tokens(prompt: str) -> None:
def count_tokens(prompt: str) -> int:
    """Counts tokens in a prompt

    :param prompt: Prompt to use as input to tokenizer
    :return: Number of tokens in the prompt

    Examples:

    >>> count_tokens("Hello world")
    2

    >>> count_tokens("Hola mundo")
    5
    """

    return len(list_tokens(prompt))

Counts tokens in a prompt

Parameters
  • prompt: Prompt to use as input to tokenizer
Returns

The number of tokens in the prompt

Examples:

>>> count_tokens("Hello world")
2
>>> count_tokens("Hola mundo")
5
def set_max_ram(value):
def set_max_ram(value):
    """Sets max allowed RAM

    This value takes priority over environment variables

    Returns the numeric value set in GB

    >>> set_max_ram(16)
    16.0

    >>> set_max_ram('512mb')
    0.5
    """

    config["max_ram"] = value

    # Read back through config, which normalizes inputs such as '512mb'
    # to a float number of gigabytes.
    normalized = config["max_ram"]
    return normalized

Sets max allowed RAM

This value takes priority over environment variables

Returns the numeric value set in GB

>>> set_max_ram(16)
16.0
>>> set_max_ram('512mb')
0.5
def require_model_license(match_re):
def require_model_license(match_re):
    """Require models to match supplied regex

    This can be used to enforce certain licensing constraints when using this
    package.

    :param match_re: Regular expression that model license names must match
    """
    # Stored in the shared config; model-selection code elsewhere is
    # presumed to filter against this pattern — confirm at call sites.
    config["model_license"] = match_re

Require models to match supplied regex

This can be used to enforce certain licensing constraints when using this package.