summaryrefslogtreecommitdiffstats
path: root/src/tests/manconv-odd-combinations
blob: 087d6fc0e071b2aeaad76b96ada8fd8af10313ac (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#! /bin/sh

# Exercise manconv with unusual combinations of candidate input encodings
# and output encodings.

# Default srcdir to the current directory, but only when the caller has not
# set it at all (a set-but-empty value is kept untouched).
if [ -z "${srcdir+set}" ]; then
	srcdir=.
fi
# shellcheck source-path=SCRIPTDIR
. "$srcdir/testlib.sh"

# The manconv program under test may be overridden through the environment;
# as above, the default applies only when MANCONV is unset.
if [ -z "${MANCONV+set}" ]; then
	MANCONV=manconv
fi

# NOTE(review): init comes from testlib.sh; the cases below rely on $tmpdir,
# which this script never assigns itself, so presumably init (or testlib.sh)
# provides it -- confirm there.
init

# Case 1: input made of the single bytes 0xA0..0xFF followed by a newline.
# That byte sequence is not valid UTF-8, and the reference output is iconv's
# ISO-8859-1 decoding of it, so manconv has to end up using the second
# candidate in "-f UTF-8:ISO-8859-1".
#
# Portability notes:
#   - a counter loop is used rather than seq(1), which POSIX does not
#     specify and which a /bin/sh environment need not provide;
#   - the byte is written with the "\0ddd" escape, the only octal form POSIX
#     defines for the argument of printf's %b conversion.
# The subshell keeps the loop counter out of the main script's namespace.
# "$tmpdir/1.inp" is reused by the ISO-8859-2 case that follows.
(
	byte=160
	while [ "$byte" -le 255 ]; do
		printf %b "\\0$(printf %03o "$byte")"
		byte=$((byte + 1))
	done
	echo
) >"$tmpdir/1.inp"

iconv -f ISO-8859-1 -t UTF-8 <"$tmpdir/1.inp" >"$tmpdir/1.exp"
run $MANCONV -f UTF-8:ISO-8859-1 -t UTF-8 <"$tmpdir/1.inp" >"$tmpdir/1.out"
expect_files_equal '-f UTF-8:ISO-8859-1 -t UTF-8 on ISO-8859-1 input' \
	"$tmpdir/1.exp" "$tmpdir/1.out"

# Same high-byte input as the ISO-8859-1 case ("$tmpdir/1.inp"), this time
# with ISO-8859-2 as the fallback encoding: the reference output is iconv's
# ISO-8859-2 decoding of those bytes.
<"$tmpdir/1.inp" iconv -f ISO-8859-2 -t UTF-8 \
	>"$tmpdir/1-latin2.exp"
<"$tmpdir/1.inp" run $MANCONV -f UTF-8:ISO-8859-2 -t UTF-8 \
	>"$tmpdir/1-latin2.out"
expect_files_equal \
	'-f UTF-8:ISO-8859-2 -t UTF-8 on ISO-8859-2 input' \
	"$tmpdir/1-latin2.exp" "$tmpdir/1-latin2.out"

# Case 2: the input starts with 1000 copies of a UTF-8 encoded hyphen, so a
# long prefix is valid UTF-8. It is followed by a line holding a KOI8-R
# encoded 'Б' (a single high byte, which is not valid UTF-8 there) and a
# final UTF-8 hyphen line. The reference output decodes the whole file as
# KOI8-R, i.e. the valid-looking prefix must not be treated as UTF-8.
#
# A POSIX counter loop replaces seq(1), which POSIX does not specify; it
# also removes the otherwise unused loop variable. The subshell keeps the
# counter local to this generator.
(
	count=0
	while [ "$count" -lt 1000 ]; do
		printf '‐'
		count=$((count + 1))
	done
	echo 'Б' | iconv -f UTF-8 -t KOI8-R
	echo '‐'
) >"$tmpdir/2.inp"
iconv -f KOI8-R -t UTF-8 <"$tmpdir/2.inp" >"$tmpdir/2.exp"
run $MANCONV -f UTF-8:KOI8-R -t UTF-8 <"$tmpdir/2.inp" >"$tmpdir/2.out"
expect_files_equal \
	'-f UTF-8:KOI8-R -t UTF-8 on KOI8-R input with UTF-8 prefix' \
	"$tmpdir/2.exp" "$tmpdir/2.out"

# Case 3: the bytes 0xA0..0xFF plus a newline are converted from ISO-8859-1
# to UTF-8 up front, so the input handed to manconv is already valid UTF-8.
# With "-f UTF-8:ISO-8859-1 -t UTF-8" the output must be identical to the
# input, which is why the input file doubles as the expected file.
#
# Portability notes:
#   - a counter loop is used rather than seq(1), which POSIX does not
#     specify and which a /bin/sh environment need not provide;
#   - the byte is written with the "\0ddd" escape, the only octal form POSIX
#     defines for the argument of printf's %b conversion.
(
	byte=160
	while [ "$byte" -le 255 ]; do
		printf %b "\\0$(printf %03o "$byte")"
		byte=$((byte + 1))
	done
	echo
) | iconv -f ISO-8859-1 -t UTF-8 >"$tmpdir/3.inp"
run $MANCONV -f UTF-8:ISO-8859-1 -t UTF-8 <"$tmpdir/3.inp" >"$tmpdir/3.out"
expect_files_equal '-f UTF-8:ISO-8859-1 -t UTF-8 preserves UTF-8 input' \
	"$tmpdir/3.inp" "$tmpdir/3.out"

# U+00B7 MIDDLE DOT has no representation in ISO-8859-2, so it is expected
# to be dropped from the output. Even so, manconv must recognise the input
# as UTF-8 instead of falling back to ISO-8859-2.
#
# Input: one UTF-8 line containing the middle dot.
printf '%s\n' 'š·ł' >"$tmpdir/4.inp"
# Expected: the same line without the middle dot, encoded as ISO-8859-2.
printf '%s\n' 'šł' | iconv -f UTF-8 -t ISO-8859-2 >"$tmpdir/4.exp"
<"$tmpdir/4.inp" run $MANCONV -f UTF-8:ISO-8859-2 \
	-t ISO-8859-2//IGNORE >"$tmpdir/4.out"
expect_files_equal \
	'recognises input encoding and omits invalid output character' \
	"$tmpdir/4.exp" "$tmpdir/4.out"

# The page below is written out as ISO-8859-13 but its coding: tag claims
# ISO-8859-7. Byte 0xAE does not exist in ISO-8859-7, so manconv cannot
# recode the page to UTF-8 without conversion errors. (When this was first
# seen in the wild, the coding: tag should really have said ISO-8859-13.)
iconv -f UTF-8 -t ISO-8859-13 <<'EOF' >"$tmpdir/5.inp"
'\" -*- coding: ISO-8859-7
REGISTERED SIGN: ®
trailing data
EOF
# Expected output: a first line whose coding: tag names UTF-8, followed by
# the remaining input lines decoded as ISO-8859-7 (the declared coding, not
# one of the -f candidates) with unconvertible bytes ignored. iconv's
# complaints about those bytes are expected, hence the discarded stderr.
{
	cat <<'EOF'
'\" -*- coding: UTF-8
EOF
	tail -n +2 <"$tmpdir/5.inp" |
		iconv -f ISO-8859-7 -t UTF-8//IGNORE 2>/dev/null
} >"$tmpdir/5.exp"
<"$tmpdir/5.inp" run $MANCONV -f UTF-8:ISO-8859-1 -t UTF-8//IGNORE \
	>"$tmpdir/5.out"
expect_files_equal \
	'copes with invalid input characters' \
	"$tmpdir/5.exp" "$tmpdir/5.out"

# All cases have run; hand control back to the test library.
# NOTE(review): finish is defined in testlib.sh, not in this script --
# presumably it reports the results and sets the exit status; confirm there.
finish