summaryrefslogtreecommitdiffstats
path: root/upstream/mageia-cauldron/man3pm/Encode::Guess.3pm
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 19:43:11 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 19:43:11 +0000
commitfc22b3d6507c6745911b9dfcc68f1e665ae13dbc (patch)
treece1e3bce06471410239a6f41282e328770aa404a /upstream/mageia-cauldron/man3pm/Encode::Guess.3pm
parentInitial commit. (diff)
downloadmanpages-l10n-fc22b3d6507c6745911b9dfcc68f1e665ae13dbc.tar.xz
manpages-l10n-fc22b3d6507c6745911b9dfcc68f1e665ae13dbc.zip
Adding upstream version 4.22.0.upstream/4.22.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'upstream/mageia-cauldron/man3pm/Encode::Guess.3pm')
-rw-r--r--upstream/mageia-cauldron/man3pm/Encode::Guess.3pm245
1 files changed, 245 insertions, 0 deletions
diff --git a/upstream/mageia-cauldron/man3pm/Encode::Guess.3pm b/upstream/mageia-cauldron/man3pm/Encode::Guess.3pm
new file mode 100644
index 00000000..2fcfac1d
--- /dev/null
+++ b/upstream/mageia-cauldron/man3pm/Encode::Guess.3pm
@@ -0,0 +1,245 @@
+.\" -*- mode: troff; coding: utf-8 -*-
+.\" Automatically generated by Pod::Man 5.01 (Pod::Simple 3.43)
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" \*(C` and \*(C' are quotes in nroff, nothing in troff, for use with C<>.
+.ie n \{\
+. ds C` ""
+. ds C' ""
+'br\}
+.el\{\
+. ds C`
+. ds C'
+'br\}
+.\"
+.\" Escape single quotes in literal strings from groff's Unicode transform.
+.ie \n(.g .ds Aq \(aq
+.el .ds Aq '
+.\"
+.\" If the F register is >0, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
+.\" entries marked with X<> in POD. Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.\"
+.\" Avoid warning from groff about undefined register 'F'.
+.de IX
+..
+.nr rF 0
+.if \n(.g .if rF .nr rF 1
+.if (\n(rF:(\n(.g==0)) \{\
+. if \nF \{\
+. de IX
+. tm Index:\\$1\t\\n%\t"\\$2"
+..
+. if !\nF==2 \{\
+. nr % 0
+. nr F 2
+. \}
+. \}
+.\}
+.rr rF
+.\" ========================================================================
+.\"
+.IX Title "Encode::Guess 3pm"
+.TH Encode::Guess 3pm 2023-11-28 "perl v5.38.2" "Perl Programmers Reference Guide"
+.\" For nroff, turn off justification. Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.if n .ad l
+.nh
+.SH NAME
+Encode::Guess \-\- Guesses encoding from data
+.SH SYNOPSIS
+.IX Header "SYNOPSIS"
+.Vb 1
+\& # if you are sure $data won\*(Aqt contain anything bogus
+\&
+\& use Encode;
+\& use Encode::Guess qw/euc\-jp shiftjis 7bit\-jis/;
+\& my $utf8 = decode("Guess", $data);
+\& my $data = encode("Guess", $utf8); # this doesn\*(Aqt work!
+\&
+\& # more elaborate way
+\& use Encode::Guess;
+\& my $enc = guess_encoding($data, qw/euc\-jp shiftjis 7bit\-jis/);
+\& ref($enc) or die "Can\*(Aqt guess: $enc"; # trap error this way
+\& $utf8 = $enc\->decode($data);
+\& # or
+\& $utf8 = decode($enc\->name, $data)
+.Ve
+.SH ABSTRACT
+.IX Header "ABSTRACT"
+Encode::Guess enables you to guess in what encoding a given piece of
+data is encoded, or at least tries to.
+.SH DESCRIPTION
+.IX Header "DESCRIPTION"
+By default, it checks only ascii, utf8 and UTF\-16/32 with BOM.
+.PP
+.Vb 1
+\& use Encode::Guess; # ascii/utf8/BOMed UTF
+.Ve
+.PP
+To use it more practically, you have to give the names of encodings to
+check (called \fIsuspects\fR from here on). The names of suspects can be
+either canonical names or aliases.
+.PP
+CAVEAT: Unlike UTF\-(16|32), BOM in utf8 is NOT AUTOMATICALLY STRIPPED.
+.PP
+.Vb 2
+\& # tries all major Japanese Encodings as well
+\& use Encode::Guess qw/euc\-jp shiftjis 7bit\-jis/;
+.Ve
+.PP
+If the \f(CW$Encode::Guess::NoUTFAutoGuess\fR variable is set to a true
+value, no heuristics will be applied to UTF8/16/32, and the result
+will be limited to the suspects and \f(CW\*(C`ascii\*(C'\fR.
+.IP Encode::Guess\->set_suspects 4
+.IX Item "Encode::Guess->set_suspects"
+You can also change the internal suspects list via \f(CW\*(C`set_suspects\*(C'\fR
+method.
+.Sp
+.Vb 2
+\& use Encode::Guess;
+\& Encode::Guess\->set_suspects(qw/euc\-jp shiftjis 7bit\-jis/);
+.Ve
+.IP Encode::Guess\->add_suspects 4
+.IX Item "Encode::Guess->add_suspects"
+Or you can use \f(CW\*(C`add_suspects\*(C'\fR method. The difference is that
+\&\f(CW\*(C`set_suspects\*(C'\fR flushes the current suspects list while
+\&\f(CW\*(C`add_suspects\*(C'\fR adds.
+.Sp
+.Vb 5
+\& use Encode::Guess;
+\& Encode::Guess\->add_suspects(qw/euc\-jp shiftjis 7bit\-jis/);
+\& # now the suspects are euc\-jp,shiftjis,7bit\-jis, AND
+\& # euc\-kr,euc\-cn, and big5\-eten
+\& Encode::Guess\->add_suspects(qw/euc\-kr euc\-cn big5\-eten/);
+.Ve
+.IP "Encode::decode(""Guess"" ...)" 4
+.IX Item "Encode::decode(""Guess"" ...)"
+When you are content with the suspects list, you can now do this:
+.Sp
+.Vb 1
+\& my $utf8 = Encode::decode("Guess", $data);
+.Ve
+.IP Encode::Guess\->guess($data) 4
+.IX Item "Encode::Guess->guess($data)"
+But it will croak if:
+.RS 4
+.IP \(bu 4
+Two or more suspects remain
+.IP \(bu 4
+No suspects left
+.RE
+.RS 4
+.Sp
+So you should try this instead:
+.Sp
+.Vb 1
+\& my $decoder = Encode::Guess\->guess($data);
+.Ve
+.Sp
+On success, \f(CW$decoder\fR is an object that is documented in
+Encode::Encoding. So you can now do this:
+.Sp
+.Vb 1
+\& my $utf8 = $decoder\->decode($data);
+.Ve
+.Sp
+On failure, \f(CW$decoder\fR contains an error message instead, so the whole thing
+would be as follows:
+.Sp
+.Vb 3
+\& my $decoder = Encode::Guess\->guess($data);
+\& die $decoder unless ref($decoder);
+\& my $utf8 = $decoder\->decode($data);
+.Ve
+.RE
+.IP "guess_encoding($data [, \fIlist of suspects\fR])" 4
+.IX Item "guess_encoding($data [, list of suspects])"
+You can also try the \f(CW\*(C`guess_encoding\*(C'\fR function, which is exported by
+default. It takes \f(CW$data\fR to check and, optionally, a list of
+suspects. The optional suspect list is \fInot reflected\fR in
+the internal suspects list.
+.Sp
+.Vb 5
+\& my $decoder = guess_encoding($data, qw/euc\-jp euc\-kr euc\-cn/);
+\& die $decoder unless ref($decoder);
+\& my $utf8 = $decoder\->decode($data);
+\& # check only ascii, utf8 and UTF\-(16|32) with BOM
+\& my $decoder = guess_encoding($data);
+.Ve
+.SH CAVEATS
+.IX Header "CAVEATS"
+.IP \(bu 4
+Because of the algorithm used, the ISO\-8859 series and other single-byte
+encodings do not work well unless one of the ISO\-8859 encodings is the
+only suspect (besides ascii and utf8).
+.Sp
+.Vb 5
+\& use Encode::Guess;
+\& # perhaps ok
+\& my $decoder = guess_encoding($data, \*(Aqlatin1\*(Aq);
+\& # definitely NOT ok
+\& my $decoder = guess_encoding($data, qw/latin1 greek/);
+.Ve
+.Sp
+The reason is that Encode::Guess guesses the encoding by trial and error.
+It first splits \f(CW$data\fR into lines and tries to decode each line for each
+suspect. It keeps going until all but one encoding is eliminated
+from the suspects list. The ISO\-8859 series is just too successful for most
+cases (because it fills almost all code points in \ex00\-\exff).
+.IP \(bu 4
+Do not mix national standard encodings and the corresponding vendor
+encodings.
+.Sp
+.Vb 3
+\& # a very bad idea
+\& my $decoder
+\& = guess_encoding($data, qw/shiftjis MacJapanese cp932/);
+.Ve
+.Sp
+The reason is that a vendor encoding is usually a superset of the national
+standard, so the guess becomes too ambiguous in most cases.
+.IP \(bu 4
+On the other hand, mixing various national standard encodings
+automagically works unless \f(CW$data\fR is too short to allow for guessing.
+.Sp
+.Vb 6
+\& # This is ok if $data is long enough
+\& my $decoder =
+\& guess_encoding($data, qw/euc\-cn
+\& euc\-jp shiftjis 7bit\-jis
+\& euc\-kr
+\& big5\-eten/);
+.Ve
+.IP \(bu 4
+DO NOT PUT TOO MANY SUSPECTS! Don't you try something like this!
+.Sp
+.Vb 2
+\& my $decoder = guess_encoding($data,
+\& Encode\->encodings(":all"));
+.Ve
+.PP
+It is, after all, just a guess. You should always be explicit when it
+comes to encodings. But there are some environments, especially Japanese
+ones, where guess-coding is a must. Use this module with care.
+.SH "TO DO"
+.IX Header "TO DO"
+Encode::Guess does not work on EBCDIC platforms.
+.SH "SEE ALSO"
+.IX Header "SEE ALSO"
+Encode, Encode::Encoding