diff options
Diffstat (limited to 'users/drashna/keyrecords/autocorrection/make_autocorrection_data.py')
-rwxr-xr-x | users/drashna/keyrecords/autocorrection/make_autocorrection_data.py | 186 |
1 files changed, 105 insertions, 81 deletions
diff --git a/users/drashna/keyrecords/autocorrection/make_autocorrection_data.py b/users/drashna/keyrecords/autocorrection/make_autocorrection_data.py index 54fd9ba594..0dd9b78b9c 100755 --- a/users/drashna/keyrecords/autocorrection/make_autocorrection_data.py +++ b/users/drashna/keyrecords/autocorrection/make_autocorrection_data.py @@ -1,4 +1,4 @@ -# Copyright 2021 Google LLC +# Copyright 2021-2022 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -29,6 +29,7 @@ Each line of the dict file defines one typo and its correction with the syntax Example: :thier -> their + dosen't -> doesn't fitler -> filter lenght -> length ouput -> output @@ -42,7 +43,7 @@ https://getreuer.info/posts/keyboards/autocorrection import sys import textwrap -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, Iterator, List, Tuple try: from english_words import english_words_lower_alpha_set as CORRECT_WORDS @@ -51,85 +52,67 @@ except ImportError: 'correctly spelled word. To check for this, install the english_words ' 'package and rerun this script:\n\n pip install english_words\n') # Use a minimal word list as a fallback. - CORRECT_WORDS = ('information', 'available', 'international', 'language', - 'loosest', 'reference', 'wealthier', 'entertainment', - 'association', 'provides', 'technology', 'statehood') + CORRECT_WORDS = ('apparent', 'association', 'available', 'classification', + 'effect', 'entertainment', 'fantastic', 'information', + 'integrate', 'international', 'language', 'loosest', + 'manual', 'nothing', 'provides', 'reference', 'statehood', + 'technology', 'virtually', 'wealthier', 'wonderful') KC_A = 4 KC_SPC = 0x2c +KC_QUOT = 0x34 + +TYPO_CHARS = dict( + [ + ("'", KC_QUOT), + (':', KC_SPC), # "Word break" character. + ] + + # Characters a-z. + [(chr(c), c + KC_A - ord('a')) for c in range(ord('a'), ord('z') + 1)] +) + def parse_file(file_name: str) -> List[Tuple[str, str]]: """Parses autocorrections dictionary file. Each line of the file defines one typo and its correction with the syntax "typo -> correction". Blank lines or lines starting with '#' are ignored. The - function validates that typos only have characters a-z and that typos are not - substrings of other typos, otherwise the longer typo would never trigger. + function validates that typos only have characters in TYPO_CHARS, that + typos are not substrings of other typos, and checking that typos don't trigger + on CORRECT_WORDS. Args: file_name: String, path of the autocorrections dictionary. Returns: List of (typo, correction) tuples. """ - + correct_words = ('information', 'available', 'international', 'language', 'loosest', 'reference', 'wealthier', 'entertainment', 'association', 'provides', 'technology', 'statehood') autocorrections = [] typos = set() - line_number = 0 - for line in open(file_name, 'rt'): - line_number += 1 - line = line.strip() - if line and line[0] != '#': - # Parse syntax "typo -> correction", using strip to ignore indenting. - tokens = [token.strip() for token in line.split('->', 1)] - if len(tokens) != 2 or not tokens[0]: - print(f'Error:{line_number}: Invalid syntax: "{line}"') + for line_number, typo, correction in parse_file_lines(file_name): + if typo in typos: + print(f'Warning:{line_number}: Ignoring duplicate typo: "{typo}"') + continue + + # Check that `typo` is valid. + if not(all([c in TYPO_CHARS for c in typo])): + print(f'Error:{line_number}: Typo "{typo}" has ' + 'characters other than ' + ''.join(TYPO_CHARS.keys())) + sys.exit(1) + for other_typo in typos: + if typo in other_typo or other_typo in typo: + print(f'Error:{line_number}: Typos may not be substrings of one ' + f'another, otherwise the longer typo would never trigger: ' + f'"{typo}" vs. "{other_typo}".') sys.exit(1) + if len(typo) < 5: + print(f'Warning:{line_number}: It is suggested that typos are at ' + f'least 5 characters long to avoid false triggers: "{typo}"') - typo, correction = tokens - typo = typo.lower() # Force typos to lowercase. - typo = typo.replace(' ', ':') - - if typo in typos: - print(f'Warning:{line_number}: Ignoring duplicate typo: "{typo}"') - continue + check_typo_against_dictionary(typo, line_number, correct_words) - # Check that `typo` is valid. - if not(all([ord('a') <= ord(c) <= ord('z') or c == ':' for c in typo])): - print(f'Error:{line_number}: Typo "{typo}" has ' - 'characters other than a-z and :.') - sys.exit(1) - for other_typo in typos: - if typo in other_typo or other_typo in typo: - print(f'Error:{line_number}: Typos may not be substrings of one ' - f'another, otherwise the longer typo would never trigger: ' - f'"{typo}" vs. "{other_typo}".') - sys.exit(1) - if len(typo) < 5: - print(f'Warning:{line_number}: It is suggested that typos are at ' - f'least 5 characters long to avoid false triggers: "{typo}"') - - if typo.startswith(':') and typo.endswith(':'): - if typo[1:-1] in CORRECT_WORDS: - print(f'Warning:{line_number}: Typo "{typo}" is a correctly spelled ' - 'dictionary word.') - elif typo.startswith(':') and not typo.endswith(':'): - for word in CORRECT_WORDS: - if word.startswith(typo[1:]): - print(f'Warning:{line_number}: Typo "{typo}" would falsely trigger ' - f'on correctly spelled word "{word}".') - elif not typo.startswith(':') and typo.endswith(':'): - for word in CORRECT_WORDS: - if word.endswith(typo[:-1]): - print(f'Warning:{line_number}: Typo "{typo}" would falsely trigger ' - f'on correctly spelled word "{word}".') - elif not typo.startswith(':') and not typo.endswith(':'): - for word in CORRECT_WORDS: - if typo in word: - print(f'Warning:{line_number}: Typo "{typo}" would falsely trigger ' - f'on correctly spelled word "{word}".') - - autocorrections.append((typo, correction)) - typos.add(typo) + autocorrections.append((typo, correction)) + typos.add(typo) return autocorrections @@ -152,6 +135,47 @@ def make_trie(autocorrections: List[Tuple[str, str]]) -> Dict[str, Any]: return trie +def parse_file_lines(file_name: str) -> Iterator[Tuple[int, str, str]]: + """Parses lines read from `file_name` into typo-correction pairs.""" + + line_number = 0 + for line in open(file_name, 'rt'): + line_number += 1 + line = line.strip() + if line and line[0] != '#': + # Parse syntax "typo -> correction", using strip to ignore indenting. + tokens = [token.strip() for token in line.split('->', 1)] + if len(tokens) != 2 or not tokens[0]: + print(f'Error:{line_number}: Invalid syntax: "{line}"') + sys.exit(1) + + typo, correction = tokens + typo = typo.lower() # Force typos to lowercase. + typo = typo.replace(' ', ':') + + yield line_number, typo, correction + + +def check_typo_against_dictionary(typo: str, line_number: int, correct_words) -> None: + """Checks `typo` against English dictionary words.""" + + if typo.startswith(':') and typo.endswith(':'): + if typo[1:-1] in correct_words: + print(f'Warning:{line_number}: Typo "{typo}" is a correctly spelled dictionary word.') + elif typo.startswith(':') and not typo.endswith(':'): + for word in correct_words: + if word.startswith(typo[1:]): + print(f'Warning:{line_number}: Typo "{typo}" would falsely trigger on correctly spelled word "{word}".') + elif not typo.startswith(':') and typo.endswith(':'): + for word in correct_words: + if word.endswith(typo[:-1]): + print(f'Warning:{line_number}: Typo "{typo}" would falsely trigger on correctly spelled word "{word}".') + elif not typo.startswith(':') and not typo.endswith(':'): + for word in correct_words: + if typo in word: + print(f'Warning:{line_number}: Typo "{typo}" would falsely trigger on correctly spelled word "{word}".') + + def serialize_trie(autocorrections: List[Tuple[str, str]], trie: Dict[str, Any]) -> List[int]: """Serializes trie and correction data in a form readable by the C code. @@ -165,7 +189,7 @@ def serialize_trie(autocorrections: List[Tuple[str, str]], table = [] # Traverse trie in depth first order. - def traverse(trie_node): + def traverse(trie_node: Dict[str, Any]) -> Dict[str, Any]: if 'LEAF' in trie_node: # Handle a leaf trie node. typo, correction = trie_node['LEAF'] word_boundary_ending = typo[-1] == ':' @@ -200,37 +224,35 @@ def serialize_trie(autocorrections: List[Tuple[str, str]], traverse(trie) - def serialize(e): - def kc_code(c): - if ord('a') <= ord(c) <= ord('z'): - return ord(c) - ord('a') + KC_A - elif c == ':': - return KC_SPC - else: - raise ValueError(f'Invalid character: {c}') - - encode_link = lambda link: [link['byte_offset'] & 255, - link['byte_offset'] >> 8] - + def serialize(e: Dict[str, Any]) -> List[int]: if not e['links']: # Handle a leaf table entry. return e['data'] elif len(e['links']) == 1: # Handle a chain table entry. - return list(map(kc_code, e['chars'])) + [0] #+ encode_link(e['links'][0])) + return [TYPO_CHARS[c] for c in e['chars']] + [0] else: # Handle a branch table entry. data = [] for c, link in zip(e['chars'], e['links']): - data += [kc_code(c) | (0 if data else 64)] + encode_link(link) + data += [TYPO_CHARS[c] | (0 if data else 64)] + encode_link(link) return data + [0] byte_offset = 0 for e in table: # To encode links, first compute byte offset of each entry. e['byte_offset'] = byte_offset byte_offset += len(serialize(e)) - assert 0 <= byte_offset <= 0xffff return [b for e in table for b in serialize(e)] # Serialize final table. +def encode_link(link: Dict[str, Any]) -> List[int]: + """Encodes a node link as two bytes.""" + byte_offset = link['byte_offset'] + if not (0 <= byte_offset <= 0xffff): + print('Error: The autocorrection table is too large, a node link exceeds ' + '64KB limit. Try reducing the autocorrection dict to fewer entries.') + sys.exit(1) + return [byte_offset & 255, byte_offset >> 8] + + def write_generated_code(autocorrections: List[Tuple[str, str]], data: List[int], file_name: str) -> None: @@ -242,7 +264,10 @@ def write_generated_code(autocorrections: List[Tuple[str, str]], file_name: String, path of the output C file. """ assert all(0 <= b <= 255 for b in data) - typo_len = lambda e: len(e[0]) + + def typo_len(e: Tuple[str, str]) -> int: + return len(e[0]) + min_typo = min(autocorrections, key=typo_len)[0] max_typo = max(autocorrections, key=typo_len)[0] generated_code = ''.join([ @@ -252,9 +277,8 @@ def write_generated_code(autocorrections: List[Tuple[str, str]], for typo, correction in autocorrections)), f'\n#define AUTOCORRECTION_MIN_LENGTH {len(min_typo)} // "{min_typo}"\n', f'#define AUTOCORRECTION_MAX_LENGTH {len(max_typo)} // "{max_typo}"\n\n', - f'#define DICTIONARY_SIZE {len(data)}\n\n', - textwrap.fill('static const uint8_t autocorrection_data[DICTIONARY_SIZE] PROGMEM = {%s};' % ( - ', '.join(map(str, data))), width=120, subsequent_indent=' '), + textwrap.fill('static const uint8_t autocorrection_data[%d] PROGMEM = {%s};' % ( + len(data), ', '.join(map(str, data))), width=80, subsequent_indent=' '), '\n\n']) with open(file_name, 'wt') as f: |