1 files changed, 105 insertions, 81 deletions
diff --git a/users/drashna/keyrecords/autocorrection/make_autocorrection_data.py b/users/drashna/keyrecords/autocorrection/make_autocorrection_data.py
index 54fd9ba594..0dd9b78b9c 100755
--- a/users/drashna/keyrecords/autocorrection/make_autocorrection_data.py
+++ b/users/drashna/keyrecords/autocorrection/make_autocorrection_data.py
@@ -1,4 +1,4 @@
-# Copyright 2021 Google LLC
+# Copyright 2021-2022 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -29,6 +29,7 @@ Each line of the dict file defines one typo and its correction with the syntax
 Example:
 
   :thier        -> their
+  dosen't       -> doesn't
   fitler        -> filter
   lenght        -> length
   ouput         -> output
@@ -42,7 +43,7 @@ https://getreuer.info/posts/keyboards/autocorrection
 
 import sys
 import textwrap
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, Iterator, List, Tuple
 
 try:
   from english_words import english_words_lower_alpha_set as CORRECT_WORDS
@@ -51,85 +52,67 @@ except ImportError:
         'correctly spelled word. To check for this, install the english_words '
         'package and rerun this script:\n\n  pip install english_words\n')
   # Use a minimal word list as a fallback.
-  CORRECT_WORDS = ('information', 'available', 'international', 'language',
-                   'loosest', 'reference', 'wealthier', 'entertainment',
-                   'association', 'provides', 'technology', 'statehood')
+  CORRECT_WORDS = ('apparent', 'association', 'available', 'classification',
+                   'effect', 'entertainment', 'fantastic', 'information',
+                   'integrate', 'international', 'language', 'loosest',
+                   'manual', 'nothing', 'provides', 'reference', 'statehood',
+                   'technology', 'virtually', 'wealthier', 'wonderful')
 
 KC_A = 4
 KC_SPC = 0x2c
+KC_QUOT = 0x34
+
+# Maps every character allowed in a typo to the keycode it is encoded as.
+TYPO_CHARS = {
+  "'": KC_QUOT,
+  ':': KC_SPC,  # "Word break" character.
+  # Characters a-z.
+  **{letter: KC_A + index
+     for index, letter in enumerate('abcdefghijklmnopqrstuvwxyz')},
+}
+
 
 def parse_file(file_name: str) -> List[Tuple[str, str]]:
   """Parses autocorrections dictionary file.
 
   Each line of the file defines one typo and its correction with the syntax
   "typo -> correction". Blank lines or lines starting with '#' are ignored. The
-  function validates that typos only have characters a-z and that typos are not
-  substrings of other typos, otherwise the longer typo would never trigger.
+  function validates that typos only have characters in TYPO_CHARS and that
+  typos are not substrings of other typos, and warns when a typo would trigger
+  on one of the CORRECT_WORDS.
 
   Args:
     file_name: String, path of the autocorrections dictionary.
   Returns:
     List of (typo, correction) tuples.
   """
-
+  correct_words = CORRECT_WORDS  # Module-level list (english_words or fallback).
   autocorrections = []
   typos = set()
-  line_number = 0
-  for line in open(file_name, 'rt'):
-    line_number += 1
-    line = line.strip()
-    if line and line[0] != '#':
-      # Parse syntax "typo -> correction", using strip to ignore indenting.
-      tokens = [token.strip() for token in line.split('->', 1)]
-      if len(tokens) != 2 or not tokens[0]:
-        print(f'Error:{line_number}: Invalid syntax: "{line}"')
+  for line_number, typo, correction in parse_file_lines(file_name):
+    if typo in typos:
+      print(f'Warning:{line_number}: Ignoring duplicate typo: "{typo}"')
+      continue
+
+    # Check that `typo` is valid.
+    if not all(c in TYPO_CHARS for c in typo):
+      print(f'Error:{line_number}: Typo "{typo}" has '
+            'characters other than ' + ''.join(TYPO_CHARS.keys()))
+      sys.exit(1)
+    for other_typo in typos:
+      if typo in other_typo or other_typo in typo:
+        print(f'Error:{line_number}: Typos may not be substrings of one '
+              f'another, otherwise the longer typo would never trigger: '
+              f'"{typo}" vs. "{other_typo}".')
         sys.exit(1)
+    if len(typo) < 5:
+      print(f'Warning:{line_number}: It is suggested that typos are at '
+            f'least 5 characters long to avoid false triggers: "{typo}"')
 
-      typo, correction = tokens
-      typo = typo.lower()  # Force typos to lowercase.
-      typo = typo.replace(' ', ':')
-
-      if typo in typos:
-        print(f'Warning:{line_number}: Ignoring duplicate typo: "{typo}"')
-        continue
+    check_typo_against_dictionary(typo, line_number, correct_words)
 
-      # Check that `typo` is valid.
-      if not(all([ord('a') <= ord(c) <= ord('z') or c == ':' for c in typo])):
-        print(f'Error:{line_number}: Typo "{typo}" has '
-              'characters other than a-z and :.')
-        sys.exit(1)
-      for other_typo in typos:
-        if typo in other_typo or other_typo in typo:
-          print(f'Error:{line_number}: Typos may not be substrings of one '
-                f'another, otherwise the longer typo would never trigger: '
-                f'"{typo}" vs. "{other_typo}".')
-          sys.exit(1)
-      if len(typo) < 5:
-        print(f'Warning:{line_number}: It is suggested that typos are at '
-              f'least 5 characters long to avoid false triggers: "{typo}"')
-
-      if typo.startswith(':') and typo.endswith(':'):
-        if typo[1:-1] in CORRECT_WORDS:
-          print(f'Warning:{line_number}: Typo "{typo}" is a correctly spelled '
-                'dictionary word.')
-      elif typo.startswith(':') and not typo.endswith(':'):
-        for word in CORRECT_WORDS:
-          if word.startswith(typo[1:]):
-            print(f'Warning:{line_number}: Typo "{typo}" would falsely trigger '
-                  f'on correctly spelled word "{word}".')
-      elif not typo.startswith(':') and typo.endswith(':'):
-        for word in CORRECT_WORDS:
-          if word.endswith(typo[:-1]):
-            print(f'Warning:{line_number}: Typo "{typo}" would falsely trigger '
-                  f'on correctly spelled word "{word}".')
-      elif not typo.startswith(':') and not typo.endswith(':'):
-        for word in CORRECT_WORDS:
-          if typo in word:
-            print(f'Warning:{line_number}: Typo "{typo}" would falsely trigger '
-                  f'on correctly spelled word "{word}".')
-
-      autocorrections.append((typo, correction))
-      typos.add(typo)
+    autocorrections.append((typo, correction))
+    typos.add(typo)
 
   return autocorrections
 
@@ -152,6 +135,47 @@ def make_trie(autocorrections: List[Tuple[str, str]]) -> Dict[str, Any]:
   return trie
 
 
+def parse_file_lines(file_name: str) -> Iterator[Tuple[int, str, str]]:
+  """Parses lines read from `file_name` into typo-correction pairs.
+
+  Yields (line_number, typo, correction) for each non-blank, non-comment line,
+  with the typo lowercased and its spaces replaced by ':'. Exits on bad syntax.
+  """
+  with open(file_name, 'rt') as f:  # Closed once iteration ends or aborts.
+    for line_number, line in enumerate(f, 1):
+      line = line.strip()
+      if line and line[0] != '#':
+        # Parse syntax "typo -> correction", using strip to ignore indenting.
+        tokens = [token.strip() for token in line.split('->', 1)]
+        if len(tokens) != 2 or not tokens[0]:
+          print(f'Error:{line_number}: Invalid syntax: "{line}"')
+          sys.exit(1)
+        typo, correction = tokens
+        typo = typo.lower()  # Force typos to lowercase.
+        typo = typo.replace(' ', ':')
+        yield line_number, typo, correction
+
+
+def check_typo_against_dictionary(typo: str, line_number: int, correct_words) -> None:
+  """Prints a warning for each word of `correct_words` that `typo` would hit."""
+
+  anchored_start = typo.startswith(':')
+  anchored_end = typo.endswith(':')
+  if anchored_start and anchored_end:  # Whole-word typo: exact match only.
+    if typo[1:-1] in correct_words:
+      print(f'Warning:{line_number}: Typo "{typo}" is a correctly spelled dictionary word.')
+    return
+
+  if anchored_start:  # Typo must begin a word.
+    clashes = (word for word in correct_words if word.startswith(typo[1:]))
+  elif anchored_end:  # Typo must end a word.
+    clashes = (word for word in correct_words if word.endswith(typo[:-1]))
+  else:  # Typo may appear anywhere inside a word.
+    clashes = (word for word in correct_words if typo in word)
+  for word in clashes:
+    print(f'Warning:{line_number}: Typo "{typo}" would falsely trigger on correctly spelled word "{word}".')
+
+
 def serialize_trie(autocorrections: List[Tuple[str, str]],
                    trie: Dict[str, Any]) -> List[int]:
   """Serializes trie and correction data in a form readable by the C code.
@@ -165,7 +189,7 @@ def serialize_trie(autocorrections: List[Tuple[str, str]],
   table = []
 
   # Traverse trie in depth first order.
-  def traverse(trie_node):
+  def traverse(trie_node: Dict[str, Any]) -> Dict[str, Any]:
     if 'LEAF' in trie_node:  # Handle a leaf trie node.
       typo, correction = trie_node['LEAF']
       word_boundary_ending = typo[-1] == ':'
@@ -200,37 +224,35 @@ def serialize_trie(autocorrections: List[Tuple[str, str]],
 
   traverse(trie)
 
-  def serialize(e):
-    def kc_code(c):
-      if ord('a') <= ord(c) <= ord('z'):
-        return ord(c) - ord('a') + KC_A
-      elif c == ':':
-        return KC_SPC
-      else:
-        raise ValueError(f'Invalid character: {c}')
-
-    encode_link = lambda link: [link['byte_offset'] & 255,
-                                link['byte_offset'] >> 8]
-
+  def serialize(e: Dict[str, Any]) -> List[int]:
     if not e['links']:  # Handle a leaf table entry.
       return e['data']
     elif len(e['links']) == 1:  # Handle a chain table entry.
-      return list(map(kc_code, e['chars'])) + [0] #+ encode_link(e['links'][0]))
+      return [TYPO_CHARS[c] for c in e['chars']] + [0]
     else:  # Handle a branch table entry.
       data = []
       for c, link in zip(e['chars'], e['links']):
-        data += [kc_code(c) | (0 if data else 64)] + encode_link(link)
+        data += [TYPO_CHARS[c] | (0 if data else 64)] + encode_link(link)
       return data + [0]
 
   byte_offset = 0
   for e in table:  # To encode links, first compute byte offset of each entry.
     e['byte_offset'] = byte_offset
     byte_offset += len(serialize(e))
-    assert 0 <= byte_offset <= 0xffff
 
   return [b for e in table for b in serialize(e)]  # Serialize final table.
 
 
+def encode_link(link: Dict[str, Any]) -> List[int]:
+  """Returns the link's byte offset as [low byte, high byte]; exits if > 0xffff."""
+  offset = link['byte_offset']
+  if offset < 0 or offset > 0xffff:
+    print('Error: The autocorrection table is too large, a node link exceeds '
+          '64KB limit. Try reducing the autocorrection dict to fewer entries.')
+    sys.exit(1)
+  return [offset % 256, offset // 256]
+
+
 def write_generated_code(autocorrections: List[Tuple[str, str]],
                          data: List[int],
                          file_name: str) -> None:
@@ -242,7 +264,10 @@ def write_generated_code(autocorrections: List[Tuple[str, str]],
     file_name: String, path of the output C file.
   """
   assert all(0 <= b <= 255 for b in data)
-  typo_len = lambda e: len(e[0])
+
+  def typo_len(e: Tuple[str, str]) -> int:
+    return len(e[0])
+
   min_typo = min(autocorrections, key=typo_len)[0]
   max_typo = max(autocorrections, key=typo_len)[0]
   generated_code = ''.join([
@@ -252,9 +277,8 @@ def write_generated_code(autocorrections: List[Tuple[str, str]],
                    for typo, correction in autocorrections)),
     f'\n#define AUTOCORRECTION_MIN_LENGTH {len(min_typo)}  // "{min_typo}"\n',
     f'#define AUTOCORRECTION_MAX_LENGTH {len(max_typo)}  // "{max_typo}"\n\n',
-    f'#define DICTIONARY_SIZE {len(data)}\n\n',
-    textwrap.fill('static const uint8_t autocorrection_data[DICTIONARY_SIZE] PROGMEM = {%s};' % (
-      ', '.join(map(str, data))), width=120, subsequent_indent='    '),
+    textwrap.fill('static const uint8_t autocorrection_data[%d] PROGMEM = {%s};' % (
+      len(data), ', '.join(map(str, data))), width=80, subsequent_indent='  '),
     '\n\n'])
 
   with open(file_name, 'wt') as f: