diff options
Diffstat (limited to 'sql/share/insert_translations_into_errmsg.py')
-rwxr-xr-x | sql/share/insert_translations_into_errmsg.py | 279 |
1 file changed, 279 insertions, 0 deletions
#!/usr/bin/python3
"""Insert translations for a new language into a copy of errmsg-utf8.txt.

How this script works
---------------------
The script is driven by a state machine that consumes input and produces
output record-by-record in an iterator-like fashion.  Coroutines are used
to consume each of the inputs only when they are needed for each state,
assuring proper rate-matching: three input sources are used to determine
the insertion point of the new language, and not all three are consumed
at the same rate.

The following steps insert the translations of the new language into a
copy of the errmsg-utf8.txt file:

1. Load the source file and map out its lines into a data structure.
2. Read the source file line by line.  For each line the machine is in
   one of three states:

   SEARCHING_FOR_NEXT_HEADER
       Scan incoming lines for a string starting with a run of capital
       letters (^[A-Z]+), copying every line to the output file (a copy
       of 'errmsg-utf8.txt').  When such a header is found, save it in
       ``current_header`` and switch to CALCULATE_INSERT_POINT.

   CALCULATE_INSERT_POINT
       Look up ``current_header`` in the mapped source data; the value is
       a list of [language, translation] pairs.  Find the alphabetical
       insert point for the new language within that list and switch to
       PERFORM_INSERT.

   PERFORM_INSERT
       Copy lines from the source file to the output file, inserting the
       new language's translation once the insert point is reached, then
       switch back to SEARCHING_FOR_NEXT_HEADER.
"""
import argparse
import bisect
import re
from dataclasses import dataclass
from typing import Any


class SectionList(list):
    """A list of [language, translation] pairs for one message section that
    also remembers where comment lines appeared in the original section."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Indices (within the original section body) of comment lines.
        self.comment_locations = []


def read_file(filename):
    """Read a whole file into one string."""
    with open(filename, 'r') as f:
        data = f.read()
    return data


def obtain_key_value_from_translation_line(translation_match, line):
    """Return (language, translation) from a matched translation line, or
    ('#', line) for any other line type so the caller can pass it through."""
    if translation_match:
        return translation_match.groups()
    # Not a "<lang> \"<text>\"" line; return as-is with '#' as the key.
    return '#', line


def map_out_source_data(data):
    """Map the source error-message file into a navigable data structure.

    A dict of header -> SectionList is used; each SectionList preserves the
    source order of the [language, translation] pairs and records the
    positions of comment lines so insert points can be adjusted later.
    """
    # Each section starts with a line beginning with capital letters.
    sections = re.split(r'\n(?=[A-Z])', data)
    data_dict = {}
    for section in sections:
        # Skip preamble chunks that do not start with a header.
        if not re.match(r'^[A-Z]+', section):
            continue
        lines = section.split('\n')
        # The first line of the section is its title (the header).
        title = lines[0].strip()
        section_list = []
        comment_list = []
        current_line_loc = 0
        # Process the section body (every line after the header).
        for line in lines[1:]:
            translation_match = re.match(r'\s*([a-z\-]+) \"(.*)\"', line)
            key, value = obtain_key_value_from_translation_line(translation_match, line)
            if key != '#':
                section_list.append([key, value])
            elif '#' in value:
                # The line is a comment; remember its location within the
                # section so insert points can account for it.
                comment_list.append(current_line_loc)
            current_line_loc += 1
        section_list_with_attributes = SectionList(section_list)
        section_list_with_attributes.comment_locations = comment_list.copy()
        data_dict[title] = section_list_with_attributes
    return data_dict


def single_file_reader(input_file_name):
    """Generator yielding one line of *input_file_name* at a time."""
    with open(input_file_name, 'r') as input_file:
        for line in input_file:
            yield line


def single_file_writer(output_file_name):
    """Coroutine: every value sent in is written to *output_file_name*."""
    with open(output_file_name, 'w') as output_file:
        while True:
            line = yield
            output_file.write(line)


def double_file_reader(file1, file2):
    """Generator yielding paired lines (line1, line2) from the two files,
    stopping at the shorter file."""
    with open(file1, 'r') as f1, open(file2, 'r') as f2:
        for line1, line2 in zip(f1, f2):
            yield (line1, line2)


def detect_language(file_name):
    """Return the language code: the first token of the file's first line."""
    with open(file_name, 'r') as f:
        first_line = f.readline()
    return first_line.split()[0]


def detect_leading_whitespace_from_source_lang_file(file_name):
    """Return the leading whitespace of the file's first line, used to
    indent the inserted translation lines consistently."""
    with open(file_name, 'r') as f:
        first_line = f.readline()
    return first_line[:len(first_line) - len(first_line.lstrip())]


@dataclass
class StateControlData:
    """Mutable bag of state shared between the state-machine actions."""
    current_state: str = ''
    current_header: str = ''
    detected_dest_lang: str = ''
    whitespace: str = ''
    insert_point_index: int = 0
    stop_state_machine: bool = False
    mapped_input_data: Any = None
    input_reader: Any = None
    output_writer: Any = None
    eng_to_new_lang_translation_mapper: Any = None


def searching_for_next_header_action(state_machine_data):
    """SEARCHING_FOR_NEXT_HEADER state: copy lines to the output until a
    section header (a line starting with capital letters) is found."""
    for input_line in state_machine_data.input_reader:
        if re.match(r'^[A-Z]+', input_line):
            state_machine_data.current_header = input_line.strip()
            state_machine_data.current_state = "CALCULATE_INSERT_POINT"
            state_machine_data.output_writer.send(input_line)
            break
        state_machine_data.output_writer.send(input_line)
    else:
        # Input exhausted without finding another header: we are done.
        state_machine_data.stop_state_machine = True

    return state_machine_data


def calculate_insert_point_action(state_machine_data):
    """CALCULATE_INSERT_POINT state: find where the new language sorts into
    the current section's (alphabetically ordered) language list."""
    detected_dest_lang = state_machine_data.detected_dest_lang
    current_header = state_machine_data.current_header

    old_lang_list = state_machine_data.mapped_input_data[current_header]

    # Binary-search the spot where the new translation fits in the list.
    index = bisect.bisect([lang for lang, _ in old_lang_list], detected_dest_lang)

    state_machine_data.insert_point_index = index
    state_machine_data.current_state = "PERFORM_INSERT"

    return state_machine_data


def finding_insert_point_action(state_machine_data):
    """PERFORM_INSERT state: copy the current section's lines to the output,
    inserting the new language's translation at the computed index."""

    def adjust_for_comments_occurring_before_insert_point(insert_point_index, comment_locations):
        # Comment lines are interleaved with translation lines in the source
        # file but are not part of the sorted language list, so every comment
        # at or before the insert point pushes the file position down by one.
        for comment_loc in comment_locations:
            if comment_loc <= insert_point_index:
                insert_point_index += 1
        return insert_point_index

    eng_to_new_lang_tuple = next(state_machine_data.eng_to_new_lang_translation_mapper)
    current_header = state_machine_data.current_header
    old_lang_list = state_machine_data.mapped_input_data[current_header]
    index = adjust_for_comments_occurring_before_insert_point(
        state_machine_data.insert_point_index, old_lang_list.comment_locations)
    detected_whitespace = state_machine_data.whitespace

    for i, _ in enumerate(old_lang_list):
        if index == i:
            state_machine_data.output_writer.send(
                detected_whitespace + eng_to_new_lang_tuple[1])

        input_line = next(state_machine_data.input_reader, None)
        if input_line is None:
            # The source file ended mid-section; stop cleanly instead of
            # sending None to the writer.
            state_machine_data.stop_state_machine = True
            break
        state_machine_data.output_writer.send(input_line)

    # The new language sorts after every existing one: place it last.
    if index >= len(old_lang_list):
        state_machine_data.output_writer.send(
            detected_whitespace + eng_to_new_lang_tuple[1])

    state_machine_data.current_state = "SEARCHING_FOR_NEXT_HEADER"
    return state_machine_data


def language_inserter(data_dict, english_lang_translations_file,
                      new_lang_translations_file, errmsg_file='errmsg-utf8.txt'):
    """Insert the new language into a copy of *errmsg_file*.

    A state machine keeps track of which step to take; coroutines keep the
    control flow tractable when dealing with four separate files.  The
    *errmsg_file* parameter defaults to 'errmsg-utf8.txt' for backward
    compatibility but lets callers use the path given on the command line.
    """
    state_machine = {
        "SEARCHING_FOR_NEXT_HEADER": searching_for_next_header_action,
        "CALCULATE_INSERT_POINT": calculate_insert_point_action,
        "PERFORM_INSERT": finding_insert_point_action,
    }

    state_machine_data = StateControlData()

    state_machine_data.output_writer = single_file_writer('errmsg-utf8-with-new-language.txt')
    next(state_machine_data.output_writer)  # Prime the coroutine.
    state_machine_data.input_reader = single_file_reader(errmsg_file)
    state_machine_data.eng_to_new_lang_translation_mapper = double_file_reader(
        english_lang_translations_file, new_lang_translations_file)

    state_machine_data.detected_dest_lang = detect_language(new_lang_translations_file)
    state_machine_data.whitespace = detect_leading_whitespace_from_source_lang_file(
        english_lang_translations_file)
    state_machine_data.current_header = ''
    state_machine_data.current_state = "SEARCHING_FOR_NEXT_HEADER"
    state_machine_data.mapped_input_data = data_dict

    while not state_machine_data.stop_state_machine:
        current_state = state_machine_data.current_state
        state_machine_data = state_machine[current_state](state_machine_data)


def main():
    """Parse arguments, map the source file, then run the inserter."""
    parser = argparse.ArgumentParser(description='''Given errmsg-utf8.txt,
        an english language file extracted from errmsg-utf8.txt and another
        file with translations into a new language from the english language
        file, reinsert the new language translations into their correct
        positions in a copy of errmsg-utf8.txt.''')
    parser.add_argument('errmsg_file', type=str, help='Path to errmsg-utf8.txt')
    parser.add_argument('english_lang_translations_file', type=str,
                        help='Path to English lang translations file')
    parser.add_argument('new_lang_translations_file', type=str,
                        help='Path to new lang translations file')

    args = parser.parse_args()

    data = read_file(args.errmsg_file)
    data_dict = map_out_source_data(data)
    print('Original file errmsg-utf8.txt has been successfully mapped into memory.')
    print('''Now starting insertion process into errmsg-utf8-with-new-language.txt which is
    a copy of errmsg-utf8.txt''')

    language_inserter(data_dict, args.english_lang_translations_file,
                      args.new_lang_translations_file, args.errmsg_file)
    print("Insertion of new language translations into errmsg-utf8-with-new-language.txt is done")


if __name__ == "__main__":
    main()