summaryrefslogtreecommitdiffstats
path: root/extensions/spellcheck/locales/en-US/hunspell/dictionary-sources/make-new-dict.sh
blob: de06dfe3d6d2d29ecd521e7164d962ff9f46bd3f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#! /usr/bin/env sh

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# This script creates a new dictionary by expanding the original,
# Mozilla's, and the upstream dictionary to remove affix flags and
# then doing the wordlist equivalent of diff3 to create a new
# dictionary.
#
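# The files 2-mozilla-added.txt and 2-mozilla-removed.txt (in support_files)
# contain words added and removed, respectively, in the Mozilla dictionary.
# The final dictionary is written to en_US-mozilla.dic; the generated
# hunspell-en_US-mozilla.zip is deleted at the end of the script.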

# Stop at the first command that exits with a non-zero status.
set -e

# Pin every locale category to C so the text tools used below (sort, comm,
# grep) work byte-wise and independently of the caller's locale.
LANG=C
LC_ALL=C
LC_CTYPE=C
LC_COLLATE=C
export LANG LC_ALL LC_CTYPE LC_COLLATE

# All working paths are anchored at the directory the script is started from.
WKDIR=$(pwd)
ORIG=$WKDIR/orig
SUPPORT_DIR=$WKDIR/support_files
SPELLER=$WKDIR/scowl/speller

# This is required by scowl scripts
SCOWL=$WKDIR/scowl/
export SCOWL

# expand AFFIX_FILE
#
# Reads a Hunspell word list (.dic format) on stdin and writes the sorted,
# de-duplicated output of scowl's "munch-list expand" to stdout.
#   $1 - affix (.aff) file handed to "munch-list expand".
# Lines that consist only of digits (such as the word-count line at the top
# of a .dic file) are dropped before expansion.
# Uses the global SPELLER, the directory that contains munch-list.
expand() {
  # '[0-9][0-9]*' is the portable BRE spelling of the GNU-only '[0-9]\+'.
  # SPELLER and $1 are quoted so paths containing spaces stay one argument.
  grep -v '^[0-9][0-9]*$' | "$SPELLER/munch-list" expand "$1" | sort -u
}

# Bail out early when the scowl speller directory, whose tools are used by
# the rest of the script, does not exist.
if [ ! -d "$SPELLER" ]; then
  # Diagnostics go to stderr so they are not mixed into regular output.
  echo "The 'scowl' folder is missing. Check the documentation at" >&2
  echo "https://firefox-source-docs.mozilla.org/extensions/spellcheck/index.html" >&2
  exit 1
fi

# Generate the new upstream dictionary inside the scowl speller directory.
# Directory variables are quoted: they derive from the current directory,
# which may contain spaces.
mkdir -p "$SUPPORT_DIR"
cd "$SPELLER"
# MK_LIST holds a complete command line. It is expanded unquoted below on
# purpose so the shell splits it into the command and its arguments.
MK_LIST="../mk-list -v1 --accents=both en_US 60"
# Record the generating command in params.txt, which is passed to
# make-hunspell-dict on the next command.
cat <<EOF > params.txt
With Input Command: $MK_LIST
EOF
# Note: the output of make-hunspell-dict is UTF-8
$MK_LIST | ./make-hunspell-dict -one en_US-custom params.txt > ./make-hunspell-dict.log
cd "$WKDIR"

# Note: Input and output of "expand" is always ISO-8859-1.
#       All expanded word list files are thus in ISO-8859-1.
# Paths built from $SPELLER, $ORIG and $SUPPORT_DIR are quoted because they
# derive from the current directory, which may contain spaces.
expand "$SPELLER/en.aff" < "$SPELLER/en.dic.supp" > "$SUPPORT_DIR/0-special.txt"

# Input is UTF-8, expand expects ISO-8859-1 so use iconv
iconv -f utf-8 -t iso-8859-1 "$ORIG/en_US-custom.dic" \
  | expand "$ORIG/en_US-custom.aff" > "$SUPPORT_DIR/1-base.txt"

# Store suggestion exclusions (ending with !) defined in current Mozilla dictionary.
# Save both the compressed (munched) and expanded version.
# grep exits with status 1 when no line matches; accept that case (an empty
# exclusion list) so that "set -e" only aborts on a real grep error.
grep '!$' ../en-US.dic > "$SUPPORT_DIR/2-mozilla-nosug-munched.txt" || [ "$?" -eq 1 ]
expand ../en-US.aff < "$SUPPORT_DIR/2-mozilla-nosug-munched.txt" > "$SUPPORT_DIR/2-mozilla-nosug.txt"

# Remove suggestion exclusions and expand the existing Mozilla dictionary.
# The existing Mozilla dictionary is already in ISO-8859-1.
grep -v '!$' < ../en-US.dic > "$SUPPORT_DIR/en-US-nosug.dic"
expand ../en-US.aff < "$SUPPORT_DIR/en-US-nosug.dic" > "$SUPPORT_DIR/2-mozilla.txt"
rm "$SUPPORT_DIR/en-US-nosug.dic"

# Input is UTF-8, expand expects ISO-8859-1 so use iconv
iconv -f utf-8 -t iso-8859-1 "$SPELLER/en_US-custom.dic" \
  | expand "$SPELLER/en_US-custom.aff" > "$SUPPORT_DIR/3-upstream.txt"

# Word-list equivalent of a three-way merge. comm compares sorted files line
# by line; the lists used here come out of expand (which ends in sort -u).
# All paths are quoted so a working directory containing spaces is handled.

# Suppress common lines and lines only in the 2nd file, leaving words that are
# only available in the 1st file (SCOWL), i.e. were removed by Mozilla.
comm -23 "$SUPPORT_DIR/1-base.txt" "$SUPPORT_DIR/2-mozilla.txt" > "$SUPPORT_DIR/2-mozilla-removed.txt"

# Suppress common lines and lines only in the 1st file, leaving words that are
# only available in the 2nd file (current Mozilla dictionary), i.e. were added
# by Mozilla.
comm -13 "$SUPPORT_DIR/1-base.txt" "$SUPPORT_DIR/2-mozilla.txt" > "$SUPPORT_DIR/2-mozilla-added.txt"

# Suppress common lines and lines only in the 2nd file, leaving words that are
# only available in the 1st file (words from the new upstream SCOWL dictionary).
# The result is upstream, minus the words removed, plus the words added.
comm -23 "$SUPPORT_DIR/3-upstream.txt" "$SUPPORT_DIR/2-mozilla-removed.txt" \
  | cat - "$SUPPORT_DIR/2-mozilla-added.txt" \
  | sort -u > "$SUPPORT_DIR/4-patched.txt"

# Feed the patched word list, minus the words in 0-special.txt, to
# make-hunspell-dict to produce the en_US-mozilla dictionary.
# Note: the output of make-hunspell-dict is UTF-8
comm -23 "$SUPPORT_DIR/4-patched.txt" "$SUPPORT_DIR/0-special.txt" \
  | "$SPELLER/make-hunspell-dict" -one en_US-mozilla /dev/null

# Add back Mozilla suggestion exclusions. Need to convert the file from
# ISO-8859-1 to UTF-8 first, then add back the line count and reorder.
# Drop the first line (the old entry count) of the generated dictionary.
tail -n +2 en_US-mozilla.dic > en_US-mozilla-complete.dic
iconv -f iso-8859-1 -t utf-8 "$SUPPORT_DIR/2-mozilla-nosug-munched.txt" >> en_US-mozilla-complete.dic
# Write the new entry count as the first line; tr strips any blank padding
# that wc may emit around the number.
wc -l < en_US-mozilla-complete.dic | tr -d '[:blank:]' > en_US-mozilla.dic
LC_ALL=C sort en_US-mozilla-complete.dic >> en_US-mozilla.dic
rm -f en_US-mozilla-complete.dic

# Sanity check should yield identical results
#comm -23 $SUPPORT_DIR/1-base.txt $SUPPORT_DIR/3-upstream.txt > $SUPPORT_DIR/3-upstream-removed.txt
#comm -13 $SUPPORT_DIR/1-base.txt $SUPPORT_DIR/3-upstream.txt > $SUPPORT_DIR/3-upstream-added.txt
#comm -23 $SUPPORT_DIR/2-mozilla.txt $SUPPORT_DIR/3-upstream-removed.txt | cat - $SUPPORT_DIR/3-upstream-added.txt | sort -u > $SUPPORT_DIR/4-patched-v2.txt

# Expand the Mozilla-specific word list using the Mozilla affix file.
expand ../en-US.aff < mozilla-specific.txt > 5-mozilla-specific.txt

# Update Mozilla removed and added wordlists based on the new upstream
# dictionary, save them as UTF-8 and not ISO-8859-1.
# Ignore words excluded from suggestions for both files.
# Paths under $SUPPORT_DIR are quoted so a working directory containing
# spaces is handled.

# Removed words that the new upstream list still contains (comm -12 keeps
# only lines common to both files), minus the suggestion exclusions.
comm -12 "$SUPPORT_DIR/3-upstream.txt" "$SUPPORT_DIR/2-mozilla-removed.txt" > "$SUPPORT_DIR/5-mozilla-removed-tmp.txt"
comm -23 "$SUPPORT_DIR/5-mozilla-removed-tmp.txt" "$SUPPORT_DIR/2-mozilla-nosug.txt" > "$SUPPORT_DIR/5-mozilla-removed.txt"
rm "$SUPPORT_DIR/5-mozilla-removed-tmp.txt"
iconv -f iso-8859-1 -t utf-8 "$SUPPORT_DIR/5-mozilla-removed.txt" > 5-mozilla-removed.txt

# Added words that the new upstream list does not contain (comm -13 keeps
# only lines unique to the 2nd file), minus the suggestion exclusions.
comm -13 "$SUPPORT_DIR/3-upstream.txt" "$SUPPORT_DIR/2-mozilla-added.txt" > "$SUPPORT_DIR/5-mozilla-added-tmp.txt"
comm -23 "$SUPPORT_DIR/5-mozilla-added-tmp.txt" "$SUPPORT_DIR/2-mozilla-nosug.txt" > "$SUPPORT_DIR/5-mozilla-added.txt"
rm "$SUPPORT_DIR/5-mozilla-added-tmp.txt"
iconv -f iso-8859-1 -t utf-8 "$SUPPORT_DIR/5-mozilla-added.txt" > 5-mozilla-added.txt

# Clean up some files
# -f tolerates files that are already gone: under "set -e" a plain rm on a
# missing file would abort the script before the backup folders are removed.
rm -f hunspell-en_US-mozilla.zip nosug

# Remove backup folders in preparation for the install-new-dict script
# A plain word list is used instead of an array: arrays are a bash extension
# and the script's shebang asks for sh.
for f in orig-bk mozilla-bk; do
  if [ -d "$SUPPORT_DIR/$f" ]; then
    echo "Removing backup folder $f"
    # ${SUPPORT_DIR:?} refuses to run rm -rf against "/$f" should the
    # variable ever be empty or unset.
    rm -rf -- "${SUPPORT_DIR:?}/$f"
  fi
done