"""
A direct translation of the webvtt file parsing algorithm.

See https://w3c.github.io/webvtt/#file-parsing for documentation
"""

import re
import string

SPACE_CHARACTERS = [' ', '\t', '\n', '\f', '\r']
# One or more space characters.  The quantifier must be "+": with "*" the
# pattern also matches the empty string, and re.split() (Python 3.7+) then
# splits between every character, so no "name:value" setting survives.
SPACE_SPLIT_PATTERN = r"[{}]+".format(''.join(SPACE_CHARACTERS))
DIGITS = string.digits


class DictInit:
    """Attribute bag: every keyword argument becomes an instance attribute."""

    def __init__(self, **attrs):
        self.__dict__.update(attrs)

    def __repr__(self):
        fields = ', '.join(
            '{}={!r}'.format(key, value)
            for key, value in self.__dict__.items()
        )
        return '{}({})'.format(type(self).__name__, fields)


class VTTCue(DictInit):
    """A parsed cue; attributes are set by VTTParser and VTTCueParser."""


class VTTRegion(DictInit):
    """A parsed region; attributes are set by VTTParser."""


class Stylesheet(DictInit):
    """A parsed STYLE block; the raw block text is stored in ``source``."""


class W3CParser:
    """Shared cursor helpers operating on ``self.input`` / ``self.position``."""

    input = None
    position = None

    def collect_characters(self, condition):
        """Consume characters while ``condition(char)`` is true.

        Advances ``self.position`` past the consumed run and returns the
        consumed text (possibly the empty string).
        """
        start = self.position
        end = start
        length = len(self.input)
        while end < length and condition(self.input[end]):
            end += 1
        self.position = end
        return self.input[start:end]

    def skip_whitespace(self):
        """Advance ``self.position`` past any run of SPACE_CHARACTERS."""
        self.collect_characters(lambda c: c in SPACE_CHARACTERS)

    def parse_percentage_string(self, input):
        """Parse a percentage string such as ``"12.5%"``.

        Returns the value as a float, or None when ``input`` is not of the
        form digits[.digits]% or the value lies outside 0..100.
        """
        # ASCII digits only; fullmatch so nothing may trail the "%".
        if not re.fullmatch(r'[0-9]+(\.[0-9]+)?%', input):
            return None
        percentage = float(input[:-1])
        if percentage < 0 or percentage > 100:
            return None
        return percentage


class VTTParser(W3CParser):
    """Parse a whole WebVTT document held in a string.

    After ``parse()`` the results are available as ``text_tracks`` (cues),
    ``stylesheets``, ``regions`` and ``errors`` (human-readable messages).
    """

    def __init__(self, input):
        self.input = input
        self.position = 0
        self.seen_cue = False
        self.text_tracks = []
        self.stylesheets = []
        self.regions = []
        self.errors = []

    def parse(self):
        """Run the WebVTT parser algorithm over ``self.input``.

        Returns None.  Header problems are reported through ``self.errors``
        and stop parsing; parsed blocks are appended to the result lists.
        """
        # 1. Normalise NUL characters and line endings.
        self.input = (
            self.input
            .replace('\0', '\ufffd')
            .replace('\r\n', '\n')
            .replace('\r', '\n')
        )
        # 2.
        self.position = 0
        # 3.
        self.seen_cue = False
        # 4.
        if len(self.input) < 6:
            self.errors.append('input too small for webvtt')
            return
        # 5.
        if len(self.input) == 6 and self.input != 'WEBVTT':
            self.errors.append('invalid webvtt header')
            return
        # 6. The signature must be followed by a space, tab or line feed.
        if len(self.input) > 6:
            valid_header = (
                self.input[0:6] == 'WEBVTT'
                and self.input[6] in ['\u0020', '\u0009', '\u000A']
            )
            if not valid_header:
                self.errors.append('invalid webvtt header')
                return
        # 7. Skip the rest of the signature line.
        self.collect_characters(lambda c: c != '\n')
        # 8.
        if self.position >= len(self.input):
            return
        # 9.
        if self.input[self.position] == '\n':
            self.position += 1
        # 10.
        if self.position >= len(self.input):
            return
        # 11. Anything directly under the signature is a header block.
        if self.input[self.position] != '\n':
            self.collect_block(in_header=True)
        else:
            self.position += 1
        # 12.
        self.collect_characters(lambda c: c == '\n')
        # 13.
        self.regions = []
        # 14.
        while self.position < len(self.input):
            # 1.
            block = self.collect_block()
            # 2.
            if isinstance(block, VTTCue):
                self.text_tracks.append(block)
            # 3.
            elif isinstance(block, Stylesheet):
                self.stylesheets.append(block)
            # 4.
            elif isinstance(block, VTTRegion):
                self.regions.append(block)
            # 5.
            self.collect_characters(lambda c: c == '\n')
        # 15.
        return

    def collect_block(self, in_header=False):
        """Collect one WebVTT block starting at ``self.position``.

        Returns a VTTCue, Stylesheet or VTTRegion, or None when the block is
        none of those (header text, a cue whose timings failed to parse, ...).
        """
        # 1. (done by class)
        line_count = 0
        # 2.
        previous_position = self.position
        # 3.
        line = ""
        # 4.
        buffer = ""
        # 5.
        seen_eof = False
        # 6.
        seen_arrow = False
        # 7.
        cue = None
        # 8.
        stylesheet = None
        # 9.
        region = None
        # 10.
        # 11.
        while True:
            # 1.
            line = self.collect_characters(lambda c: c != '\n')
            # 2.
            line_count += 1
            # 3.
            if self.position >= len(self.input):
                seen_eof = True
            else:
                self.position += 1
            # 4.
            if '-->' in line:
                # 1. A timing line is only accepted as the first line, or as
                #    the second line when the first was a cue identifier.
                if not in_header and (
                        line_count == 1
                        or line_count == 2 and not seen_arrow):
                    # 1.
                    seen_arrow = True
                    # 2.
                    previous_position = self.position
                    # 3.
                    cue = VTTCue(
                        id=buffer,
                        pause_on_exit=False,
                        region=None,
                        writing_direction='horizontal',
                        snap_to_lines=True,
                        line='auto',
                        line_alignment='start alignment',
                        position='auto',
                        position_alignment='auto',
                        cue_size=100,
                        text_alignment='center',
                        text='',
                    )
                    # 4.
                    cue_parser = VTTCueParser(self, line, cue)
                    if not cue_parser.collect_cue_timings_and_settings():
                        cue = None
                    else:
                        buffer = ''
                        self.seen_cue = True  # DIFFERENCE
                else:
                    # Rewind so the arrow line starts the next block.
                    self.errors.append('invalid webvtt cue block')
                    self.position = previous_position
                    break
            # 5.
            elif line == '':
                break
            # 6.
            else:
                # 1. STYLE / REGION blocks are only recognised before the
                #    first cue, with the keyword alone on the first line.
                if not in_header and line_count == 2:
                    # 1.
                    if not self.seen_cue and re.match(r'^STYLE\s*$', buffer):
                        stylesheet = Stylesheet(
                            location=None,
                            parent=None,
                            owner_node=None,
                            owner_rule=None,
                            media=None,
                            title=None,
                            alternate=False,
                            origin_clean=True,
                            source=None,
                        )
                        buffer = ''
                    # 2.
                    elif not self.seen_cue and re.match(r'^REGION\s*$', buffer):
                        region = VTTRegion(
                            id='',
                            width=100,
                            lines=3,
                            anchor_point=(0, 100),
                            viewport_anchor_point=(0, 100),
                            scroll_value=None,
                        )
                        buffer = ''
                # 2.
                if buffer != '':
                    buffer += '\n'
                # 3.
                buffer += line
                # 4.
                previous_position = self.position
            # 7.
            if seen_eof:
                break
        # 12.
        if cue is not None:
            cue.text = buffer
            return cue
        # 13.
        elif stylesheet is not None:
            stylesheet.source = buffer
            return stylesheet
        # 14.
        elif region is not None:
            self.collect_region_settings(region, buffer)
            return region
        # 15.
        return None

    def collect_region_settings(self, region, input):
        """Apply the ``name:value`` settings found in ``input`` to ``region``.

        Unknown names and malformed values are skipped without an error.
        """
        # 1. / 2.
        for setting in re.split(SPACE_SPLIT_PATTERN, input):
            # 1. - 3. Skip settings without a ":" or whose first ":" is the
            #         first or last character (empty name or empty value).
            name, colon, value = setting.partition(':')
            if not colon or not name or not value:
                continue
            # 4.
            if name == "id":
                region.id = value
            elif name == "width":
                percentage = self.parse_percentage_string(value)
                if percentage is not None:
                    region.width = percentage
            elif name == "lines":
                if re.fullmatch(r'[0-9]+', value):
                    region.lines = int(value)
            elif name == "regionanchor":
                anchor = self._parse_anchor(value)
                if anchor is not None:
                    region.anchor_point = anchor
            elif name == "viewportanchor":
                anchor = self._parse_anchor(value)
                if anchor is not None:
                    region.viewport_anchor_point = anchor
            elif name == "scroll":
                if value == "up":
                    region.scroll_value = "up"

    def _parse_anchor(self, value):
        """Parse ``"X%,Y%"`` into an ``(x, y)`` float tuple, or None."""
        anchor_x, comma, anchor_y = value.partition(',')
        if not comma:
            return None
        percentage_x = self.parse_percentage_string(anchor_x)
        percentage_y = self.parse_percentage_string(anchor_y)
        if percentage_x is None or percentage_y is None:
            return None
        return (percentage_x, percentage_y)


class VTTCueParser(W3CParser):
    """Parse one cue timing line (timestamps plus settings) into ``cue``.

    ``parent`` supplies the shared ``errors`` list and the ``regions`` that a
    ``region:`` setting is looked up in.
    """

    def __init__(self, parent, input, cue):
        self.parent = parent
        self.errors = self.parent.errors
        self.input = input
        self.position = 0
        self.cue = cue

    def collect_cue_timings_and_settings(self):
        """Collect WebVTT cue timings and settings from ``self.input``.

        On success sets ``start_time`` and ``end_time`` (seconds, float) on
        the cue, applies any settings and returns True.  On failure appends a
        message to ``self.errors`` and returns False.
        """
        # 1. (handled by class)
        # 2.
        self.position = 0
        # 3.
        self.skip_whitespace()
        # 4.
        timestamp = self.collect_timestamp()
        if timestamp is None:
            self.errors.append('invalid start time for VTTCue')
            return False
        self.cue.start_time = timestamp
        # 5.
        self.skip_whitespace()
        # 6. - 8. startswith() cannot run off the end of a truncated line,
        #         unlike indexing the three characters one by one.
        if not self.input.startswith('-->', self.position):
            self.errors.append('missing --> separator in VTTCue timings')
            return False
        self.position += 3
        # 9.
        self.skip_whitespace()
        # 10.
        timestamp = self.collect_timestamp()
        if timestamp is None:
            self.errors.append('invalid end time for VTTCue')
            return False
        self.cue.end_time = timestamp
        # 11.
        remainder = self.input[self.position:]
        # 12.
        self.parse_settings(remainder)
        # Extra
        return True

    def parse_settings(self, input):
        """Apply the ``name:value`` cue settings found in ``input``.

        Unknown names and malformed values are skipped without an error.
        """
        # 1. / 2.
        for setting in re.split(SPACE_SPLIT_PATTERN, input):
            # 1. - 3. Skip settings without a ":" or whose first ":" is the
            #         first or last character (empty name or empty value).
            name, colon, value = setting.partition(':')
            if not colon or not name or not value:
                continue
            # 4.
            if name == 'region':
                # The last region declared with this id wins.
                matching = (
                    region
                    for region in reversed(self.parent.regions)
                    if region.id == value
                )
                self.cue.region = next(matching, None)
            elif name == 'vertical':
                if value in ('rl', 'lr'):
                    self.cue.writing_direction = value
            elif name == 'line':
                self._apply_line_setting(value)
            elif name == 'position':
                self._apply_position_setting(value)
            elif name == 'size':
                number = self.parse_percentage_string(value)
                if number is not None:
                    self.cue.cue_size = number
            elif name == 'align':
                if value in ('start', 'center', 'end', 'left', 'right'):
                    self.cue.text_alignment = value

    def _apply_line_setting(self, value):
        """Apply a ``line:`` value (``pos`` or ``pos,align``) to the cue."""
        # 1. / 2.
        linepos, comma, linealign = value.partition(',')
        if not comma:
            linealign = None
        # 3.
        if not re.search(r'[0-9]', linepos):
            return
        # 4.
        is_percentage = linepos[-1] == '%'
        if is_percentage:
            number = self.parse_percentage_string(linepos)
            if number is None:
                return
        else:
            # Only digits, "-" and "."; "-" only in front; at most one ".",
            # which must sit between two digits.
            if not re.fullmatch(r'[-.0-9]*', linepos):
                return
            if '-' in linepos[1:]:
                return
            if linepos.count('.') > 1:
                return
            if '.' in linepos and not re.search(r'[0-9]\.[0-9]', linepos):
                return
            number = float(linepos)
        # 5. - 8. An unrecognised alignment invalidates the whole setting.
        if linealign is not None:
            if linealign not in ('start', 'center', 'end'):
                return
            self.cue.line_alignment = linealign
        # 9.
        self.cue.line = number
        # 10.
        self.cue.snap_to_lines = not is_percentage

    def _apply_position_setting(self, value):
        """Apply a ``position:`` value (``pos%`` or ``pos%,align``)."""
        # 1. / 2.
        colpos, comma, colalign = value.partition(',')
        if not comma:
            colalign = None
        # 3.
        number = self.parse_percentage_string(colpos)
        if number is None:
            return
        # 4. - 7. An unrecognised alignment invalidates the whole setting.
        if colalign is not None:
            if colalign not in ('line-left', 'center', 'line-right'):
                return
            self.cue.position_alignment = colalign
        # 8.
        self.cue.position = number

    def _consume(self, char):
        """Advance past ``char`` if it is next; return whether it was."""
        if self.position < len(self.input) and self.input[self.position] == char:
            self.position += 1
            return True
        return False

    def collect_timestamp(self):
        """Collect a WebVTT timestamp at ``self.position``.

        Accepts ``mm:ss.ttt`` and ``hh:mm:ss.ttt`` (hours may have any number
        of digits).  Returns the time in seconds as a float, or None when the
        text at the cursor is not a valid timestamp.
        """
        # 1. (handled by class)
        # 2.
        most_significant_units = 'minutes'
        # 3.
        if self.position >= len(self.input):
            return None
        # 4.
        if self.input[self.position] not in DIGITS:
            return None
        # 5.
        digits = self.collect_characters(lambda c: c in DIGITS)
        # 6.
        value_1 = int(digits)
        # 7. A first field that cannot be minutes must be hours.
        if len(digits) != 2 or value_1 > 59:
            most_significant_units = 'hours'
        # 8.
        if not self._consume(':'):
            return None
        # 9.
        digits = self.collect_characters(lambda c: c in DIGITS)
        # 10.
        if len(digits) != 2:
            return None
        # 11.
        value_2 = int(digits)
        # 12.
        colon_follows = (
            self.position < len(self.input)
            and self.input[self.position] == ':'
        )
        if most_significant_units == 'hours' or colon_follows:
            # 1.
            if not self._consume(':'):
                return None
            # 2.
            digits = self.collect_characters(lambda c: c in DIGITS)
            # 3.
            if len(digits) != 2:
                return None
            # 4.
            value_3 = int(digits)
        else:
            # mm:ss form: shift the fields down and use zero hours.
            value_1, value_2, value_3 = 0, value_1, value_2
        # 13.
        if not self._consume('.'):
            return None
        # 14.
        digits = self.collect_characters(lambda c: c in DIGITS)
        # 15.
        if len(digits) != 3:
            return None
        # 16.
        value_4 = int(digits)
        # 17. 59 itself is a valid minute/second value.
        if value_2 > 59 or value_3 > 59:
            return None
        # 18.
        result = value_1 * 60 * 60 + value_2 * 60 + value_3 + value_4 / 1000
        # 19.
        return result


def main(argv):
    """Parse every file named in ``argv[1:]`` and print the results."""
    for path in argv[1:]:
        # WebVTT files are UTF-8; "utf-8-sig" also drops an optional BOM so
        # the "WEBVTT" signature check sees the signature first.
        with open(path, 'r', encoding='utf-8-sig') as file:
            parser = VTTParser(file.read())
        parser.parse()
        print("Results: {}".format(path))
        print("  Cues: {}".format(parser.text_tracks))
        print("  StyleSheets: {}".format(parser.stylesheets))
        print("  Regions: {}".format(parser.regions))
        print("  Errors: {}".format(parser.errors))


if __name__ == '__main__':
    import sys
    main(sys.argv)