numbers.py (7621B)
"""Spoken-number parsing and the Talon captures built on top of it.

The pure-Python helpers (`parse_number`, `scan_small_numbers`, `parse_scale`,
`split_list`) turn a list of number words such as
``["one", "hundred", "and", "five"]`` into a digit string (``"105"``).  The
captures at the bottom of the file expose that parsing to Talon grammars.
"""
from typing import List, Optional, Union, Iterator

from talon import Context, Module, actions

mod = Module()
ctx = Context()

# Word tables.  NOTE: the name `digits` is rebound further down by the
# `digits` capture function, so after import it no longer refers to this list;
# everything that needs the words goes through `digits_map` instead.
digits = "zero one two three four five six seven eight nine".split()
teens = "eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen".split()
tens = "ten twenty thirty forty fifty sixty seventy eighty ninety".split()
scales = "hundred thousand million billion trillion quadrillion quintillion sextillion septillion octillion nonillion decillion".split()

# Word -> value lookups.  Insertion order matters: the capture rules below are
# built by joining the keys of these dicts in order.
digits_map = {n: i for i, n in enumerate(digits)}
digits_map["oh"] = 0
teens_map = {n: i + 11 for i, n in enumerate(teens)}
tens_map = {n: 10 * (i + 1) for i, n in enumerate(tens)}
# "thousand" is 10**3, "million" is 10**6, ...; "hundred" does not fit the
# power-of-a-thousand pattern, so it is added separately.
scales_map = {n: 10 ** (3 * (i + 1)) for i, n in enumerate(scales[1:])}
scales_map["hundred"] = 100

numbers_map = digits_map.copy()
numbers_map.update(teens_map)
numbers_map.update(tens_map)
numbers_map.update(scales_map)


def parse_number(l: List[str]) -> str:
    """Parses a list of words into a number/digit string.

    Small number words are converted first, then every scale word is folded
    in, from "hundred" upwards (the order of `scales`).  Whatever numbers
    remain are concatenated, so ["one", "two", "three"] gives "123".

    Raises KeyError if a word (other than "and") is not a known number word.
    """
    l = list(scan_small_numbers(l))
    for scale in scales:
        l = parse_scale(scale, l)
    return "".join(str(n) for n in l)


def scan_small_numbers(l: List[str]) -> Iterator[Union[str, int]]:
    """
    Takes a list of number words, yields a generator of mixed numbers & strings.
    Translates small number terms (<100) into corresponding numbers.
    Drops all occurrences of "and".
    Smashes digits onto tens words, eg. ["twenty", "one"] -> [21].
    But note that "ten" and "zero" are excluded, ie:
      ["ten", "three"] -> [10, 3]
      ["fifty", "zero"] -> [50, 0]
    Does nothing to scale words ("hundred", "thousand", "million", etc).
    """
    words = [w for w in l if w != "and"]
    i = 0
    while i < len(words):
        word = words[i]
        i += 1
        following = words[i] if i < len(words) else None
        # fuse tens onto digits, eg. "twenty", "one" -> 21.  A following
        # "zero"/"oh" (value 0) or a non-digit word is deliberately not fused.
        if word in tens_map and word != "ten" and digits_map.get(following, 0) != 0:
            i += 1
            yield numbers_map[word] + numbers_map[following]
        # turn small number terms into corresponding numbers
        elif word not in scales_map:
            yield numbers_map[word]
        # scale words are passed through untouched for parse_scale
        else:
            yield word


def parse_scale(scale: str, l: List[Union[str, int]]) -> List[Union[str, int]]:
    """Parses a list of mixed numbers & strings for occurrences of the following
    pattern:

        <multiplier> <scale> <remainder>

    where <scale> is a scale word like "hundred", "thousand", "million", etc and
    multiplier and remainder are numbers or strings of numbers of the
    appropriate size. For example:

        parse_scale("hundred", [1, "hundred", 2]) -> [102]
        parse_scale("thousand", [12, "thousand", 3, 45]) -> [12345]

    We assume that all scales of lower magnitude have already been parsed; don't
    call parse_scale("thousand") until you've called parse_scale("hundred").

    Returns a new list; the input list is not modified.
    """
    scale_value = scales_map[scale]
    scale_digits = len(str(scale_value))

    # Split the list on the desired scale word, then parse from left to right.
    left, *splits = split_list(scale, l)
    for right in splits:
        # (1) Figure out the multiplier by looking to the left of the scale
        # word. We ignore non-integers because they are scale words that we
        # haven't processed yet; this strategy means that "thousand hundred"
        # gets parsed as 1,100 instead of 100,000, but "hundred thousand" is
        # parsed correctly as 100,000.
        before = 1  # default multiplier
        if left and isinstance(left[-1], int) and left[-1] != 0:
            before = left.pop()

        # (2) Absorb numbers to the right, eg. in [1, "thousand", 1, 26], "1
        # thousand" absorbs ["1", "26"] to make 1,126. We take numbers off the
        # front of `right` until one more would fill up the scale's digits.
        absorbed = ""
        taken = 0
        for item in right:
            if not isinstance(item, int):
                break
            candidate = absorbed + str(item)
            if len(candidate) >= scale_digits:
                break
            absorbed = candidate
            taken += 1
        after = int(absorbed) if absorbed else 0

        # (3) Push the parsed number into place, append whatever was left
        # unparsed, and continue.
        left.append(before * scale_value + after)
        left.extend(right[taken:])

    return left


def split_list(value, l: list) -> Iterator:
    """Splits a list by occurrences of a given value.

    Yields the sub-lists between occurrences; always yields at least one list
    (the whole input, copied, when `value` does not occur).
    """
    start = 0
    while True:
        try:
            i = l.index(value, start)
        except ValueError:
            break
        yield l[start:i]
        start = i + 1
    yield l[start:]


# # ---------- TESTS (uncomment to run) ----------
# def test_number(expected, string):
#     print('testing:', string)
#     l = list(scan_small_numbers(string.split()))
#     print("  scan --->", l)
#     for scale in scales:
#         old = l
#         l = parse_scale(scale, l)
#         if scale in old: print("  parse -->", l)
#         else: assert old == l, "parse_scale should do nothing if the scale does not occur in the list"
#     result = "".join(str(n) for n in l)
#     assert result == parse_number(string.split())
#     assert str(expected) == result, f"parsing {string!r}, expected {expected}, got {result}"

# test_number(105000, "one hundred and five thousand")
# test_number(1000000, "one thousand thousand")
# test_number(1501000, "one million five hundred one thousand")
# test_number(1501106, "one million five hundred and one thousand one hundred and six")
# test_number(123, "one two three")
# test_number(123, "one twenty three")
# test_number(104, "ten four") # borderline, but valid in some dialects
# test_number(1066, "ten sixty six") # a common way of saying years
# test_number(1906, "nineteen oh six") # year
# test_number(2001, "twenty oh one") # year
# test_number(2020, "twenty twenty")
# test_number(1001, "one thousand one")
# test_number(1010, "one thousand ten")
# test_number(123456, "one hundred and twenty three thousand and four hundred and fifty six")
# test_number(123456, "one twenty three thousand four fifty six")

# ## failing (and somewhat debatable) tests from old numbers.py
# #test_number(10000011, "one million one one")
# #test_number(100001010, "one million ten ten")
# #test_number(1050006000, "one hundred thousand and five thousand and six thousand")


# ---------- CAPTURES ----------
# Grammar alternations, eg. "(zero|one|...|nine|oh)", in dict insertion order.
alt_digits = "(" + "|".join(digits_map.keys()) + ")"
alt_teens = "(" + "|".join(teens_map.keys()) + ")"
alt_tens = "(" + "|".join(tens_map.keys()) + ")"
alt_scales = "(" + "|".join(scales_map.keys()) + ")"
number_word = "(" + "|".join(numbers_map.keys()) + ")"


# TODO: allow things like "double eight" for 88
@ctx.capture("digit_string", rule=f"({alt_digits} | {alt_teens} | {alt_tens})+")
def digit_string(m) -> str:
    """Parses a sequence of digit/teen/tens words, returning it as a string."""
    return parse_number(list(m))


# NOTE: this definition rebinds the module-level name `digits` (previously the
# list of digit words).  The name is kept for compatibility.
@ctx.capture("digits", rule="<digit_string>")
def digits(m) -> int:
    """Parses a phrase representing a digit sequence, returning it as an integer."""
    return int(m.digit_string)


@mod.capture(rule=f"{number_word}+ (and {number_word}+)*")
def number_string(m) -> str:
    """Parses a number phrase, returning that number as a string."""
    return parse_number(list(m))


@ctx.capture("number", rule="<user.number_string>")
def number(m) -> int:
    """Parses a number phrase, returning it as an integer."""
    return int(m.number_string)


@ctx.capture("number_signed", rule="[negative|minus] <number>")
def number_signed(m):
    """Parses an optionally negated number phrase, returning the signed value."""
    value = m[-1]
    return -value if (m[0] in ["negative", "minus"]) else value


@ctx.capture(
    "number_small", rule=f"({alt_digits} | {alt_teens} | {alt_tens} [{alt_digits}])"
)
def number_small(m):
    """Parses a single small number phrase, returning it as an integer."""
    return int(parse_number(list(m)))