summaryrefslogtreecommitdiffstats
path: root/extensions/spellcheck/locales/en-US/hunspell/dictionary-sources/edit-dictionary.sh
blob: e72654e84db27358569cda3d711853cfde2efae1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#! /usr/bin/env sh

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Stop at the first command that fails.
set -e

# Directory the script is launched from.
WKDIR="$(pwd)"
# Directory holding the speller tools; munch-list is run from here.
SPELLER="${WKDIR}/scowl/speller"

# Run a word list through `munch-list munch`.
# Globals:   SPELLER (read) - directory containing munch-list
# Arguments: $1 - path of the affix file handed to munch-list
# Input:     word list on stdin
# Output:    munch-list output, sorted with duplicate lines removed
munch() {
  # Both expansions are quoted so paths containing spaces stay one word.
  "$SPELLER/munch-list" munch "$1" | sort -u
}

# Run dictionary entries through `munch-list expand`.
# Globals:   SPELLER (read) - directory containing munch-list
# Arguments: $1 - path of the affix file handed to munch-list
# Input:     dictionary entries on stdin
# Output:    munch-list output, sorted with duplicate lines removed
expand() {
  # Lines made up solely of digits are dropped before expansion. The
  # pattern uses [0-9][0-9]* because \+ is not part of POSIX basic regexes.
  grep -v '^[0-9][0-9]*$' | "$SPELLER/munch-list" expand "$1" | sort -u
}

# Nothing can be done without the speller tools directory.
[ -d "$SPELLER" ] || {
  printf '%s\n' \
    "The 'scowl' folder is missing. Check the documentation at" \
    "https://firefox-source-docs.mozilla.org/extensions/spellcheck/index.html"
  exit 1
}

# An editor is needed to enter the new words. The message names the
# variable literally, hence the single quotes.
[ -n "$EDITOR" ] || {
  printf '%s\n' 'Need to set the $EDITOR environment variable to your favorite editor.'
  exit 1
}

# Open the editor and allow the user to type or paste words
echo "Editor is going to open, you can add the list of words. Quit the editor to finish editing."
echo "Press Enter to begin."
# Only waits for Enter; whatever is typed is discarded.
read -r _unused
# $EDITOR is left unquoted, so a value that carries arguments is word-split
# into the command and its options.
$EDITOR temp-list.txt

if [ ! -f temp-list.txt ]; then
  echo "The content of the editor hasn't been saved."
  exit 1
fi
# Remove empty lines. In-place editing with `sed -i ""` only works with BSD
# sed (GNU sed takes the empty string as the script and then fails), so the
# result is written to a scratch file and moved back.
sed '/^$/d' temp-list.txt > temp-list.nonblank
mv temp-list.nonblank temp-list.txt

# Paths of the current dictionary, its affix file and its UTF-8 copy.
dic_file=../en-US.dic
aff_file=../en-US.aff
utf8_dic=utf8/en-US-utf8.dic

# Take everything after the first line (the entry count) of the current
# en-US dictionary.
tail -n +2 "$dic_file" > en-US.stripped

# Re-encode from ISO-8859-1 to UTF-8; the stripped copy is then discarded.
iconv -f iso-8859-1 -t utf-8 < en-US.stripped > en-US.utf8
rm en-US.stripped

# Set aside, from the UTF-8 copy of the dictionary, the entries ending in
# "!" (excluded from suggestions) and the numerals: munching gives a
# different result for both.
grep '!$' "$utf8_dic" > en-US-nosug.txt
grep '^[0-9][a-z/]' "$utf8_dic" > en-US-numerals.txt

# Turn the dictionary entries into a plain word list.
expand "$aff_file" < en-US.utf8 > en-US-wordlist.txt
rm en-US.utf8

# Append the words entered in the editor.
cat temp-list.txt >> en-US-wordlist.txt
rm temp-list.txt

# Keep only the words that do not start with a digit.
grep -v '^[0-9]' en-US-wordlist.txt > en-US-wordlist-nonum.txt
rm en-US-wordlist.txt

# Compress the word list back into dictionary entries where the affix rules
# allow it.
munch "$aff_file" < en-US-wordlist-nonum.txt > en-US-munched.dic
rm en-US-wordlist-nonum.txt

# Remove words that should not be suggested.
#
# Deletes from a dictionary file every entry whose word (the text before the
# first "/") is also the word of an entry in an exclusion list.
# Arguments: $1 - exclusion list (one entry per line, word before the "/")
#            $2 - dictionary file, rewritten in place
# Returns:   non-zero when the dictionary could not be filtered; the
#            dictionary file is left untouched in that case
remove_nosug_entries() {
  # A single awk pass replaces one `sed -i ""` run per excluded word:
  #  - it needs neither bash arrays (`read -a`) nor BSD-only `sed -i ""`;
  #  - words are compared as literal strings, so characters such as "."
  #    or ":" in a word cannot act as regex syntax or break a delimiter.
  # `pass` is set from the command line before each file is read, which
  # keeps the two inputs apart even when the exclusion list is empty.
  awk -F/ '
    pass == 1 { skip[$1] = 1; next }
    !($1 in skip)
  ' pass=1 "$1" pass=2 "$2" > "$2.tmp" || {
    rm -f "$2.tmp"
    return 1
  }
  mv "$2.tmp" "$2"
}

remove_nosug_entries en-US-nosug.txt en-US-munched.dic

# Re-append what was set aside from the original .dic file (suggestion
# exclusions first, then numerals) and delete the two scratch lists.
cat en-US-nosug.txt en-US-numerals.txt >> en-US-munched.dic
rm en-US-nosug.txt en-US-numerals.txt

# Assemble the new dictionary: the number of entries on the first line
# (blanks that wc may emit are stripped), then the entries sorted byte-wise.
{
  wc -l < en-US-munched.dic | tr -d '[:blank:]'
  LC_ALL=C sort en-US-munched.dic
} > en-US.dic
rm -f en-US-munched.dic

# Write the ISO-8859-1 encoding of the result to ../en-US.dic.
iconv -f utf-8 -t iso-8859-1 < en-US.dic > ../en-US.dic

# The UTF-8 version is kept as utf8/en-US-utf8.dic.
mv en-US.dic utf8/en-US-utf8.dic