summaryrefslogtreecommitdiffstats
path: root/sql/share/insert_translations_into_errmsg.py
blob: 6c5677362c218c135f67f2984cec8bb758e3868d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
#!/usr/bin/python3
import argparse
import bisect
import pdb
import re
from dataclasses import dataclass
from typing import Any
################################################################################
# How this script works
# The script is mainly driven by a state machine that consumes input
# and produces output "record-by-record" in an iterator-like fashion. Coroutines
# are used to consume each of the inputs only when they are needed for each
# state, assuring proper rate-matching as 3 input sources are utilized to
# determine the insertion point of the new language, and not all
# 3 inputs are consumed at the same rate.
# The following steps are performed by the script to insert translations
# of the new language into a copy of the errmsg-utf8.txt file:
# 1. Load the source file and map out the lines in a data structure
# 2. Start reading the source file line by line.
#     2.1 For each line you can be in
#          2.1.1 SEARCHING_FOR_NEXT_HEADER state
#                - In this state, we continually search the incoming 
#                  lines from the source file for a string starting
#                  with a series of capital letters (^[A-Z]+).
#                - Write each line to the output file, which is a copy
#                  of 'errmsg-utf8.txt'.
#                - Change the state to CALCULATE_INSERT_POINT if a string matching
#                  the previous criteria is found
#                - Take the string starting with capitals and save it in
#                  the current_header variable
#          2.1.2 CALCULATE_INSERT_POINT state
#                - Go to the data structure for the source file and
#                  using the current_header as a key, read out the
#                  value part of the structure. The value part should be
#                  a list.
#                - Find the insert point for the new language
#                  error message based on the list from the previous step.
#                - Change state to PERFORM_INSERT
#          2.1.3 PERFORM_INSERT state
#                - Read the source file and copy out each line to the output
#                  file (the copy of 'errmsg-utf8.txt').
#                - Continue reading the source file and checking if the
#                  insert point has been reached. Once it has been reached
#                  insert the new language in the output file.
#                - Change state to SEARCHING_FOR_NEXT_HEADER 
################################################################################

class SectionList(list):
    """A list of [language, translation] pairs for one error-message section.

    Behaves exactly like a plain list but additionally carries
    ``comment_locations``: the offsets (within the section) of comment
    lines from the original source file.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Offsets of comment lines inside the original section; filled in
        # by map_out_source_data() after construction.
        self.comment_locations = []


def read_file(filename):
    """Return the entire contents of *filename* as one string."""
    with open(filename, 'r') as source:
        return source.read()

def obtain_key_value_from_translation_line(translation_match, line):
    """Return a (key, value) pair for one section line.

    When *translation_match* is a successful match of the translation
    pattern, its two capture groups (language key, quoted text) are
    returned.  Any other line type (comment, blank, ...) is passed
    through unchanged with '#' as the key.
    """
    if translation_match is None:
        # Not a "<lang> \"...\"" line: hand it back as-is under the
        # sentinel '#' key so the caller can treat it as a non-translation.
        return '#', line
    return translation_match.groups()

def map_out_source_data(data):
    '''
    Load the source error message file into a navigable data structure.

    *data* is the full text of errmsg-utf8.txt.  It is split into sections
    at lines starting with a capital letter (the error-symbol headers);
    the result maps each header to a SectionList of [language, translation]
    pairs.  Lists of lists (rather than dicts) are used so the source
    order is not disrupted.  Each SectionList also records, in its
    ``comment_locations`` attribute, the offsets of comment lines within
    the section so the insertion point can later be adjusted past them.
    '''
    # Compile once instead of re-matching the pattern on every line.
    translation_re = re.compile(r'\s*([a-z\-]+) \"(.*)\"')
    # Split the data into sections: a new section begins at each line that
    # starts with a capital letter.
    sections = re.split(r'\n(?=[A-Z])', data)
    data_dict = {}
    for section in sections:
        # Skip any prologue chunk that is not a real header section.
        if not re.match(r'^[A-Z]+', section):
            continue
        lines = section.split('\n')
        # The title of the section is its first line.
        title = lines[0].strip()
        section_list = []
        comment_list = []
        # Process each line after the title, tracking its offset.
        for current_line_loc, line in enumerate(lines[1:]):
            translation_match = translation_re.match(line)
            key, value = obtain_key_value_from_translation_line(translation_match, line)
            if key != '#':
                section_list.append([key, value])
            elif '#' in value:
                # Current line is a comment: remember where it sits in the
                # original file so the insert point can be shifted later.
                comment_list.append(current_line_loc)
        section_list_with_attributes = SectionList(section_list)
        section_list_with_attributes.comment_locations = comment_list.copy()
        data_dict[title] = section_list_with_attributes
    return data_dict

def single_file_reader(input_file_name):
    """Generator yielding the lines of *input_file_name* one at a time."""
    with open(input_file_name, 'r') as source:
        yield from source

def single_file_writer(output_file_name):
    """Coroutine that writes every value sent to it into *output_file_name*.

    Prime it with next() before the first send(); closing the coroutine
    releases the underlying file.
    """
    with open(output_file_name, 'w') as sink:
        while True:
            sink.write((yield))

def double_file_reader(file1, file2):
    """Yield corresponding line pairs from *file1* and *file2* in lockstep.

    Iteration stops as soon as the shorter of the two files is exhausted.
    """
    with open(file1, 'r') as first, open(file2, 'r') as second:
        yield from zip(first, second)

def detect_language(file_name):
    """Return the language key of *file_name*.

    The key is the first whitespace-separated token of the first line.
    """
    with open(file_name, 'r') as f:
        return f.readline().split()[0]

def detect_leading_whitespace_from_source_lang_file(file_name):
    """Return the leading-whitespace prefix of the first line of *file_name*.

    Used to indent inserted translations exactly like the source language
    lines they sit next to.
    """
    with open(file_name, 'r') as f:
        first_line = f.readline()
    indent_width = len(first_line) - len(first_line.lstrip())
    return first_line[:indent_width]

@dataclass
class StateControlData:
    """Mutable bag of bookkeeping shared between the state-machine actions.

    Each state handler receives this object, mutates it, and returns it so
    the driver loop can dispatch on ``current_state``.
    """
    current_state: str = ''
    current_header: str = ''
    detected_dest_lang: str = ''
    whitespace: str = ''
    insert_point_index: int = 0
    stop_state_machine: bool = False
    # ``typing.Any`` (not the builtin ``any`` function, which the original
    # used by mistake) is the correct annotation for these untyped
    # collaborators: the mapped data dict, reader/writer coroutines, and
    # the english-to-new-language pair iterator.
    mapped_input_data: Any = None
    input_reader: Any = None
    output_writer: Any = None
    eng_to_new_lang_translation_mapper: Any = None
    
    
def searching_for_next_header_action(state_machine_data):
    """SEARCHING_FOR_NEXT_HEADER state handler.

    Copies input lines to the output writer until a header line (one that
    begins with capital letters) appears; records that header and moves to
    the CALCULATE_INSERT_POINT state.  When the input is exhausted without
    finding a header, flags the state machine to stop.
    """
    header_pattern = re.compile(r'^[A-Z]+')
    found_header = False
    for input_line in state_machine_data.input_reader:
        # Every line is copied through, header or not.
        state_machine_data.output_writer.send(input_line)
        if header_pattern.match(input_line):
            state_machine_data.current_header = input_line.strip()
            state_machine_data.current_state = "CALCULATE_INSERT_POINT"
            found_header = True
            break
    if not found_header:
        # Reader ran dry: the whole source file has been processed.
        state_machine_data.stop_state_machine = True

    return state_machine_data

def calculate_insert_point_action(state_machine_data):
    """CALCULATE_INSERT_POINT state handler.

    Looks up the current header's existing translations and determines,
    by alphabetical order of the language keys, the index at which the
    new language belongs.  Stores that index and transitions to the
    PERFORM_INSERT state.
    """
    translations = state_machine_data.mapped_input_data[state_machine_data.current_header]
    existing_langs = [language for language, _ in translations]

    # bisect gives the slot that keeps the language keys sorted.
    state_machine_data.insert_point_index = bisect.bisect(
        existing_langs, state_machine_data.detected_dest_lang)
    state_machine_data.current_state = "PERFORM_INSERT"

    return state_machine_data

def finding_insert_point_action(state_machine_data):
    """PERFORM_INSERT state handler.

    Copies the current section's lines from the input reader to the output
    writer and, at the (comment-adjusted) insert point, emits the new
    language's translation line.  If the insert point lies past every
    existing translation, the new line is appended after the section.
    Transitions back to SEARCHING_FOR_NEXT_HEADER.
    """
    def adjust_for_comments_occuring_before_insert_point(insert_point_index, comment_locations):
        # Each comment line sitting at or before the computed slot pushes
        # the slot one line further down in the raw file.
        for comment_loc in comment_locations:
            if comment_loc <= insert_point_index:
                insert_point_index += 1
        return insert_point_index

    # One (english, new-language) line pair is consumed per section.
    eng_to_new_lang_tuple = next(state_machine_data.eng_to_new_lang_translation_mapper)
    current_header = state_machine_data.current_header
    old_lang_list = state_machine_data.mapped_input_data[current_header]
    index = adjust_for_comments_occuring_before_insert_point(
        state_machine_data.insert_point_index, old_lang_list.comment_locations)
    detected_whitespace = state_machine_data.whitespace

    for i, _ in enumerate(old_lang_list):
        if index == i:
            # The new translation goes in front of the i-th existing line.
            state_machine_data.output_writer.send(
                detected_whitespace + eng_to_new_lang_tuple[1])

        input_line = next(state_machine_data.input_reader, None)
        # The reader normally supplies one source line per mapped entry.
        # (The original left a pdb.set_trace() debug breakpoint here and
        # then sent None to the writer; skip the line instead of crashing.)
        if input_line is not None:
            state_machine_data.output_writer.send(input_line)

    # Insert position past every existing translation: new lang goes last.
    if index >= len(old_lang_list):
        state_machine_data.output_writer.send(
            detected_whitespace + eng_to_new_lang_tuple[1])

    state_machine_data.current_state = "SEARCHING_FOR_NEXT_HEADER"
    return state_machine_data
        

def language_inserter(data_dict, english_lang_translations_file, new_lang_translations_file):
    '''
    Insert the new language into a copy of errmsg-utf8.txt.

    A small state machine (dispatch table of state-name -> action) tracks
    which step to perform next; coroutines keep the control flow tractable
    while four separate files are in play.
    '''
    state_actions = {
        "SEARCHING_FOR_NEXT_HEADER": searching_for_next_header_action,
        "CALCULATE_INSERT_POINT": calculate_insert_point_action,
        "PERFORM_INSERT": finding_insert_point_action,
    }

    ctl = StateControlData()

    # Wire up the I/O coroutines: one writer for the output copy, one
    # reader for the source file, one lockstep reader for the pair of
    # language files.
    ctl.output_writer = single_file_writer('errmsg-utf8-with-new-language.txt')
    next(ctl.output_writer)  # prime the writer coroutine
    ctl.input_reader = single_file_reader('errmsg-utf8.txt')
    ctl.eng_to_new_lang_translation_mapper = double_file_reader(
        english_lang_translations_file, new_lang_translations_file)

    ctl.detected_dest_lang = detect_language(new_lang_translations_file)
    ctl.whitespace = detect_leading_whitespace_from_source_lang_file(english_lang_translations_file)
    ctl.current_header = ''
    ctl.current_state = "SEARCHING_FOR_NEXT_HEADER"
    ctl.mapped_input_data = data_dict

    # Run the machine until an action flags that the input is exhausted.
    while not ctl.stop_state_machine:
        ctl = state_actions[ctl.current_state](ctl)
        

def main():
    '''Entry point: parse arguments, map the source file, run the inserter.'''
    parser = argparse.ArgumentParser(description='''Given errmsg-utf8.txt, 
    an english language file extracted from errmsg-utf8.txt and another
    file with translations into a new language from the english language
    file, reinsert the new language translations into their correct
    positions in a copy of errmsg-utf8.txt.''')
    parser.add_argument('errmsg_file', type=str, help='Path to errmsg-utf8.txt')
    parser.add_argument('english_lang_translations_file', type=str, help='Path to English lang translations file')
    parser.add_argument('new_lang_translations_file', type=str, help='Path to new lang translations file')
    args = parser.parse_args()

    data_dict = map_out_source_data(read_file(args.errmsg_file))
    print('Original file errmsg-utf8.txt has been successfully mapped into memory.')
    print('''Now starting insertion process into errmsg-utf8-with-new-language.txt which is
 a copy of errmsg-utf8.txt''')

    # To hard-code the language source files instead of taking them from
    # the command line, replace the two attribute reads below with literal
    # file names (e.g. 'all_english_text_in_errmsg-utf8.txt' and
    # 'all_swahili_text_in_errmsg-utf8.txt') and disable argument parsing.
    language_inserter(data_dict,
                      args.english_lang_translations_file,
                      args.new_lang_translations_file)
    print("Insertion of new language translations into errmsg-utf8-with-new-language.txt is done")

# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()