src/seastar/doc/htmlsplit.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88

#!/usr/bin/env python3

# This script takes the single-page HTML output from pandoc - tutorial.html -
# and splits it into many pages in split/: one page index.html for the table
# of contents, and an additional page for each chapter. We make sure that
# links from the TOC to each chapter, and also links across chapters,
# continue to work correctly, and also add links from each chapter back to
# the TOC, as well as to the next and previous chapters.


# Copyright (C) 2018 ScyllaDB.
#
# This file is open source software, licensed to you under the terms
# of the Apache License, Version 2.0 (the "License").  See the NOTICE file
# distributed with this work for additional information regarding copyright
# ownership.  You may not use this file except in compliance with the License.
#
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import re
# Chapter number (int) -> chapter title text, filled in while reading the
# table of contents and used for the "Previous"/"Next" navigation links.
titles = {}
# Section anchor name (the part after '#') -> number, kept as a string, of
# the chapter that contains it; used to retarget links after the split.
sections = {}
def links(out, chapter):
    """Write the navigation links for a chapter page to *out*.

    Nothing is written for chapter 0 (the table-of-contents page). For any
    other chapter a link back to index.html is written, followed by
    "Previous" and "Next" links for whichever neighbouring chapters have an
    entry in the module-level ``titles`` map.

    out     -- object with a ``write(str)`` method (the page being written).
    chapter -- integer chapter number of the page.

    Errors raised by ``out.write`` propagate to the caller; only a missing
    neighbour title is treated as "no link to emit".
    """
    if chapter == 0:
        return
    out.write('<A HREF="index.html">Back to table of contents</A>. ')
    for label, neighbour in (('Previous', chapter - 1), ('Next', chapter + 1)):
        # An absent title means there is no such chapter (e.g. before the
        # first or after the last one), so that link is simply skipped.
        title = titles.get(neighbour)
        if title is None:
            continue
        out.write(label + ': <A HREF="' + str(neighbour) + '.html">' + str(neighbour) + '. ' + title + '</A>. ')
def flush(chapter, header, chunk):
    """Write one finished page into the split/ directory.

    Chapter 0 is saved as index.html; any other chapter as "<chapter>.html".
    The page consists of *header*, the navigation links, *chunk*, the
    navigation links again, and finally the closing body/html tags.
    """
    if chapter == 0:
        page_name = 'index.html'
    else:
        page_name = str(chapter) + '.html'
    with open('split/' + page_name, 'w') as page:
        page.write(header)
        # Navigation appears both above and below the page content.
        links(page, chapter)
        page.write(chunk)
        links(page, chapter)
        page.write('</body></html>')
# Main pass: read tutorial.html line by line, accumulate the text of the
# current page in "chunk", and write a page out with flush() every time a new
# top-level heading starts (and once more at the end of the input).
with open("tutorial.html") as f:
    chunk = ""
    # Chapter currently being read. Set to 0 while reading the TOC, or
    # numbers > 0 while reading a chapter. It stays None until the TOC
    # marker line has been seen.
    chapter = None
    for line in f:
        if line in ('<div id="TOC">\n', '<nav id="TOC">\n'):
            # Everything read before the TOC marker becomes the header that
            # flush() writes at the top of every output page.
            header = chunk
            chapter = 0
            chunk = ""
        elif line.startswith('<h1 id="'):
            # A top-level heading ends the page collected so far and starts
            # the next chapter; the heading line itself goes to the new page.
            flush(chapter, header, chunk)
            chunk = ""
            chapter += 1
        elif chapter == 0 and line.startswith('<li><a href="#'):
            # For all sections, remember the mapping from name-with-dashes
            # to the chapter number they are in, in "sections". We need this
            # to support links to other sections.
            match = re.search('href="#([^"]*)".*>([0-9]+)[.<]', line)
            if match:
                sections[match.group(1)] = match.group(2)
            # Replace the link to '#section' of an entry numbered N or N.M
            # with a link to 'N.html#section'. Entries that do not carry a
            # section number are left untouched rather than failing on a
            # None match.
            match = re.match('^(.*href=")(#.*>)([0-9]+)([.<].*)$', line)
            if match:
                # The pattern's '$' stops before the trailing newline, so it
                # is added back explicitly.
                line = match.group(1) + match.group(3) + '.html' + match.group(2) + match.group(3) + match.group(4) + '\n'
            # For chapters, remember the mapping from number to name in the
            # map "titles", so we can use them later in links to next and
            # previous chapter. Only entries whose number is a single
            # integer directly followed by </span> match here.
            match = re.search('>([0-9]+)</span> (.*)</a>', line)
            if match:
                titles[int(match.group(1))] = match.group(2)
        elif chapter != 0:
            # In a chapter we can have a link to a different subsection, which
            # looks like <a href="#some-title">Some title</A>. We need to
            # replace this to refer to the right file after the split.
            # A link to an anchor that was not listed in the TOC raises
            # KeyError from the "sections" lookup.
            line = re.sub(
                '<a href="#([^"]*)">([^<]*)</a>',
                lambda m: '<a href="' + sections[m.group(1)] + '.html#' + m.group(1) + '">' + m.group(2) + '</a>',
                line)
        chunk += line
    # Write out the last chapter, which has no following heading to end it.
    flush(chapter, header, chunk)