summaryrefslogtreecommitdiffstats
path: root/sql/share/insert_translations_into_errmsg.py
diff options
context:
space:
mode:
Diffstat (limited to 'sql/share/insert_translations_into_errmsg.py')
-rwxr-xr-xsql/share/insert_translations_into_errmsg.py279
1 files changed, 279 insertions, 0 deletions
diff --git a/sql/share/insert_translations_into_errmsg.py b/sql/share/insert_translations_into_errmsg.py
new file mode 100755
index 00000000..6c567736
--- /dev/null
+++ b/sql/share/insert_translations_into_errmsg.py
@@ -0,0 +1,279 @@
+#!/usr/bin/python3
+import pdb
+import re
+from dataclasses import dataclass
+import bisect
+import argparse
+################################################################################
+# How this script works
+# The script is mainly driven by a state machine that consumes input
+# and produces output "record-by-record" in an iterator-like fashion. Coroutines
+# are used to consume each of the inputs only when they are needed for each
+# state, assuring proper rate-matching as 3 input sources are utilized to
+# determine the insertion point of the new language, and not all
+# 3 inputs are consumed at the same rate.
+# The following steps are performed by the script to insert translations
+# of the new language into a copy of the errmsg-utf8.txt file:
+# 1. Load the source file and map out the lines in a data structure
+# 2. Start reading the source file line by line.
+# 2.1 For each line you can be in
+# 2.1.1 SEARCHING_FOR_NEXT_HEADER state
+# - In this state, we continually search the incoming
+# lines from the source file for a string starting
+# with a series of capital letters (^[A-Z]+).
+# - Write each line to the output file, which is a copy
+# of 'errmsg-utf8.txt'.
+# - Change the state to CALCULATE_INSERT_POINT if a string matching
+# the previous criteria is found
+# - Take the string starting with capitals and save it in
+#             the current_header variable
+# 2.1.2 CALCULATE_INSERT_POINT state
+# - Go to the data structure for the source file and
+#             using the current_header as a key, read out the
+# value part of the structure. The value part should be
+#             a list.
+# - Find the insert point for the new language
+# error message based on the list from the previous step.
+# - Change state to PERFORM_INSERT
+# 2.1.3 PERFORM_INSERT state
+# - Read the source file and copy out each line to the output
+# file (the copy of 'errmsg-utf8.txt').
+# - Continue reading the source file and checking if the
+# insert point has been reached. Once it has been reached
+# insert the new language in the output file.
+# - Change state to SEARCHING_FOR_NEXT_HEADER
+################################################################################
+
class SectionList(list):
    """A list of [language, text] pairs for one error-message section.

    Behaves exactly like a plain list but additionally carries
    ``comment_locations``: the line offsets (within the section) at which
    '#' comment lines sat in the original source file.
    """
    def __init__(self, *args, **kwargs):
        # Attach the extra attribute first, then let list do its setup.
        self.comment_locations = []
        super().__init__(*args, **kwargs)
+
+
def read_file(filename):
    '''Read *filename* in one go and return its entire contents as a string.'''
    # The error-message sources are UTF-8 (the file is literally named
    # "errmsg-utf8.txt"), so be explicit instead of trusting the locale's
    # default encoding, which can mis-decode on non-UTF-8 systems.
    with open(filename, 'r', encoding='utf-8') as f:
        return f.read()
+
def obtain_key_value_from_translation_line(translation_match, line):
    """Return a (language, text) pair extracted from a translation line.

    translation_match -- a regex match over the line, or None
    line -- the raw line the match was attempted on

    Lines that did not match (comments, blanks, anything that is not a
    language/translation pair) are passed through unchanged, keyed by '#'.
    """
    if translation_match is None:
        return '#', line
    return translation_match.groups()
+
def map_out_source_data(data):
    '''
    Load the source error message file into a navigable data structure.

    Returns a dict mapping each section header (a line starting with capital
    letters) to a SectionList of [language, text] pairs, in source order.
    The offsets of '#' comment lines within each section are recorded on the
    SectionList's comment_locations attribute so insert points can later be
    adjusted past them.
    '''
    # Compile once instead of re-matching the pattern on every line.
    translation_re = re.compile(r'\s*([a-z\-]+) \"(.*)\"')
    # Split the data into sections, each introduced by an all-caps header.
    sections = re.split(r'\n(?=[A-Z])', data)
    data_dict = {}
    for section in sections:
        # Skip any leading chunk that is not headed by a capitalised name.
        if not re.match(r'^[A-Z]+', section):
            continue
        lines = section.split('\n')
        # The title of the section is the first line.
        title = lines[0].strip()
        section_list = []
        comment_list = []
        # Process each line (except the header).
        for line_loc, line in enumerate(lines[1:]):
            translation_match = translation_re.match(line)
            key, value = obtain_key_value_from_translation_line(translation_match, line)
            if key != '#':
                section_list.append([key, value])
            elif '#' in value:
                # Current line is a comment: keep track of its location in
                # the original file so insert points can skip over it.
                comment_list.append(line_loc)
        mapped_section = SectionList(section_list)
        mapped_section.comment_locations = comment_list
        data_dict[title] = mapped_section
    return data_dict
+
def single_file_reader(input_file_name):
    """Generator yielding *input_file_name* line by line.

    The file stays open only for the lifetime of the generator.
    """
    with open(input_file_name, 'r') as input_file:
        yield from input_file
+
def single_file_writer(output_file_name):
    """Coroutine that writes every value sent to it into *output_file_name*.

    Prime it with one next() call before the first send(); closing the
    generator closes the underlying file.
    """
    with open(output_file_name, 'w') as output_file:
        while True:
            output_file.write((yield))
+
def double_file_reader(file1, file2):
    """Yield (line_from_file1, line_from_file2) pairs in lockstep.

    Iteration stops as soon as the shorter of the two files runs out.
    """
    with open(file1, 'r') as f1, open(file2, 'r') as f2:
        yield from zip(f1, f2)
+
def detect_language(file_name):
    """Return the language tag of a translations file.

    The tag is the first whitespace-delimited token of the first line
    (e.g. 'swahili' in 'swahili "ujumbe"').

    Raises ValueError (instead of a bare IndexError) when the first line
    contains no token at all.
    """
    with open(file_name, 'r', encoding='utf-8') as f:
        tokens = f.readline().split()
    if not tokens:
        raise ValueError('cannot detect language: first line of %r is empty' % file_name)
    return tokens[0]
+
def detect_leading_whitespace_from_source_lang_file(file_name):
    """Return the run of leading whitespace on the first line of *file_name*.

    Used so inserted translations are indented exactly like the source
    language lines they sit beside.
    """
    with open(file_name, 'r') as f:
        first_line = f.readline()
    stripped = first_line.lstrip()
    return first_line[:len(first_line) - len(stripped)]
+
@dataclass
class StateControlData:
    """Mutable bag of state shared between the state-machine actions.

    The original annotated the last four fields with the builtin function
    ``any`` (a mistake for ``typing.Any``); ``object`` is used instead as
    the honest "anything, narrow before use" type.
    """
    current_state: str = ''           # name of the state to execute next
    current_header: str = ''          # section header currently being processed
    detected_dest_lang: str = ''      # language tag of the new translations
    whitespace: str = ''              # indentation copied from the source lang file
    insert_point_index: int = 0       # slot in the section's language list
    stop_state_machine: bool = False  # set once the source file is exhausted
    mapped_input_data: object = None  # dict produced by map_out_source_data()
    input_reader: object = None       # generator over the source file lines
    output_writer: object = None      # primed writer coroutine
    eng_to_new_lang_translation_mapper: object = None  # (eng, new) pair iterator
+
+
def searching_for_next_header_action(state_machine_data):
    """Copy source lines to the output until an all-caps header appears.

    Every line read is echoed to the output writer. When a line starting
    with capital letters is found it is recorded as the current header and
    the state advances to CALCULATE_INSERT_POINT. If the input runs out
    first, stop_state_machine is raised to end the run.
    """
    header_pattern = re.compile(r'^[A-Z]+')
    for line in state_machine_data.input_reader:
        state_machine_data.output_writer.send(line)
        if header_pattern.match(line):
            state_machine_data.current_header = line.strip()
            state_machine_data.current_state = "CALCULATE_INSERT_POINT"
            break
    else:
        # Source exhausted without another header: we are done.
        state_machine_data.stop_state_machine = True

    return state_machine_data
+
def calculate_insert_point_action(state_machine_data):
    """Work out where the new language slots into the current section.

    The section's languages are alphabetically ordered, so a binary search
    over the existing language tags gives the list index the new language
    should occupy. Stores the index and advances to PERFORM_INSERT.
    """
    section = state_machine_data.mapped_input_data[state_machine_data.current_header]
    existing_langs = [lang for lang, _ in section]
    state_machine_data.insert_point_index = bisect.bisect(
        existing_langs, state_machine_data.detected_dest_lang)
    state_machine_data.current_state = "PERFORM_INSERT"

    return state_machine_data
+
def finding_insert_point_action(state_machine_data):
    """Copy the current section to the output, inserting the new translation.

    Reads exactly one source line per existing translation in the section,
    echoing each to the output; the new language line (indented with the
    detected whitespace) is emitted just before the slot it sorts into, or
    appended after the section when it sorts last.

    Raises RuntimeError if the source file ends mid-section (the original
    left a pdb.set_trace() debugger breakpoint here, then sent None to the
    writer, which would have crashed anyway).
    """
    def _shift_past_comments(insert_point_index, comment_locations):
        # Comment lines occupy slots in the source section but are absent
        # from the language list, so every comment at or before the insert
        # point pushes the insertion one line further down.
        for comment_loc in comment_locations:
            if comment_loc <= insert_point_index:
                insert_point_index += 1
        return insert_point_index

    eng_to_new_lang_tuple = next(state_machine_data.eng_to_new_lang_translation_mapper)
    old_lang_list = state_machine_data.mapped_input_data[state_machine_data.current_header]
    index = _shift_past_comments(state_machine_data.insert_point_index,
                                 old_lang_list.comment_locations)
    detected_whitespace = state_machine_data.whitespace

    for i, _elem in enumerate(old_lang_list):
        if index == i:
            # Insertion point reached: emit the new translation first.
            state_machine_data.output_writer.send(detected_whitespace + eng_to_new_lang_tuple[1])

        input_line = next(state_machine_data.input_reader, None)
        if input_line is None:
            raise RuntimeError('source file ended before section %r was fully copied'
                               % state_machine_data.current_header)
        state_machine_data.output_writer.send(input_line)

    # New language sorts after every existing one: append it at the end.
    if index >= len(old_lang_list):
        state_machine_data.output_writer.send(detected_whitespace + eng_to_new_lang_tuple[1])

    state_machine_data.current_state = "SEARCHING_FOR_NEXT_HEADER"
    return state_machine_data
+
+
def language_inserter(data_dict, english_lang_translations_file, new_lang_translations_file,
                      source_file='errmsg-utf8.txt',
                      output_file='errmsg-utf8-with-new-language.txt'):
    '''
    Insert the new language into a copy of the source error-message file,
    using a state machine to keep track of what step to take. Coroutines
    keep control flow tractable when dealing with 4 separate files.

    data_dict -- mapping produced by map_out_source_data()
    english_lang_translations_file -- english lines extracted from the source
    new_lang_translations_file -- their translations, line for line
    source_file -- error-message file to copy; the default preserves the
                   previously hard-coded name for existing callers, but the
                   path main() parses can now actually be used
    output_file -- where the merged copy is written
    '''
    # Dispatch table: state name -> action that consumes input/output streams
    # and returns the (possibly updated) state data.
    state_machine = {
        "SEARCHING_FOR_NEXT_HEADER": searching_for_next_header_action,
        "CALCULATE_INSERT_POINT": calculate_insert_point_action,
        "PERFORM_INSERT": finding_insert_point_action,
    }

    state_machine_data = StateControlData()

    state_machine_data.output_writer = single_file_writer(output_file)
    next(state_machine_data.output_writer)  # prime the writer coroutine
    state_machine_data.input_reader = single_file_reader(source_file)
    state_machine_data.eng_to_new_lang_translation_mapper = double_file_reader(
        english_lang_translations_file, new_lang_translations_file)

    state_machine_data.detected_dest_lang = detect_language(new_lang_translations_file)
    state_machine_data.whitespace = detect_leading_whitespace_from_source_lang_file(
        english_lang_translations_file)
    state_machine_data.current_header = ''
    state_machine_data.current_state = "SEARCHING_FOR_NEXT_HEADER"
    state_machine_data.mapped_input_data = data_dict

    # Run until an action reports that the source file is exhausted.
    while not state_machine_data.stop_state_machine:
        current_state = state_machine_data.current_state
        state_machine_data = state_machine[current_state](state_machine_data)
+
+
def main():
    """Parse command-line arguments and drive the translation insertion.

    Loads and maps the source error-message file, then re-inserts the new
    language translations into a fresh copy of it.
    """
    parser = argparse.ArgumentParser(description='''Given errmsg-utf8.txt,
    an english language file extracted from errmsg-utf8.txt and another
    file with translations into a new language from the english language
    file, reinsert the new language translations into their correct
    positions in a copy of errmsg-utf8.txt.''')
    # Positional arguments, in the order the script expects them.
    parser.add_argument('errmsg_file', type=str, help='Path to errmsg-utf8.txt')
    parser.add_argument('english_lang_translations_file', type=str, help='Path to English lang translations file')
    parser.add_argument('new_lang_translations_file', type=str, help='Path to new lang translations file')
    args = parser.parse_args()

    data_dict = map_out_source_data(read_file(args.errmsg_file))
    print('Original file errmsg-utf8.txt has been successfully mapped into memory.')
    print('''Now starting insertion process into errmsg-utf8-with-new-language.txt which is
    a copy of errmsg-utf8.txt''')

    language_inserter(data_dict, args.english_lang_translations_file,
                      args.new_lang_translations_file)
    print("Insertion of new language translations into errmsg-utf8-with-new-language.txt is done")
+
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    main()
+
+
+