diff options
Diffstat (limited to '')
-rw-r--r-- | src/unicode/graphic.go | 144 |
1 files changed, 144 insertions, 0 deletions
diff --git a/src/unicode/graphic.go b/src/unicode/graphic.go new file mode 100644 index 0000000..ca62419 --- /dev/null +++ b/src/unicode/graphic.go @@ -0,0 +1,144 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package unicode + +// Bit masks for each code point under U+0100, for fast lookup. +const ( + pC = 1 << iota // a control character. + pP // a punctuation character. + pN // a numeral. + pS // a symbolic character. + pZ // a spacing character. + pLu // an upper-case letter. + pLl // a lower-case letter. + pp // a printable character according to Go's definition. + pg = pp | pZ // a graphical character according to the Unicode definition. + pLo = pLl | pLu // a letter that is neither upper nor lower case. + pLmask = pLo +) + +// GraphicRanges defines the set of graphic characters according to Unicode. +var GraphicRanges = []*RangeTable{ + L, M, N, P, S, Zs, +} + +// PrintRanges defines the set of printable characters according to Go. +// ASCII space, U+0020, is handled separately. +var PrintRanges = []*RangeTable{ + L, M, N, P, S, +} + +// IsGraphic reports whether the rune is defined as a Graphic by Unicode. +// Such characters include letters, marks, numbers, punctuation, symbols, and +// spaces, from categories L, M, N, P, S, Zs. +func IsGraphic(r rune) bool { + // We convert to uint32 to avoid the extra test for negative, + // and in the index we convert to uint8 to avoid the range check. + if uint32(r) <= MaxLatin1 { + return properties[uint8(r)]&pg != 0 + } + return In(r, GraphicRanges...) +} + +// IsPrint reports whether the rune is defined as printable by Go. Such +// characters include letters, marks, numbers, punctuation, symbols, and the +// ASCII space character, from categories L, M, N, P, S and the ASCII space +// character. This categorization is the same as IsGraphic except that the +// only spacing character is ASCII space, U+0020. +func IsPrint(r rune) bool { + if uint32(r) <= MaxLatin1 { + return properties[uint8(r)]&pp != 0 + } + return In(r, PrintRanges...) +} + +// IsOneOf reports whether the rune is a member of one of the ranges. +// The function "In" provides a nicer signature and should be used in preference to IsOneOf. +func IsOneOf(ranges []*RangeTable, r rune) bool { + for _, inside := range ranges { + if Is(inside, r) { + return true + } + } + return false +} + +// In reports whether the rune is a member of one of the ranges. +func In(r rune, ranges ...*RangeTable) bool { + for _, inside := range ranges { + if Is(inside, r) { + return true + } + } + return false +} + +// IsControl reports whether the rune is a control character. +// The C (Other) Unicode category includes more code points +// such as surrogates; use Is(C, r) to test for them. +func IsControl(r rune) bool { + if uint32(r) <= MaxLatin1 { + return properties[uint8(r)]&pC != 0 + } + // All control characters are < MaxLatin1. + return false +} + +// IsLetter reports whether the rune is a letter (category L). +func IsLetter(r rune) bool { + if uint32(r) <= MaxLatin1 { + return properties[uint8(r)]&(pLmask) != 0 + } + return isExcludingLatin(Letter, r) +} + +// IsMark reports whether the rune is a mark character (category M). +func IsMark(r rune) bool { + // There are no mark characters in Latin-1. + return isExcludingLatin(Mark, r) +} + +// IsNumber reports whether the rune is a number (category N). +func IsNumber(r rune) bool { + if uint32(r) <= MaxLatin1 { + return properties[uint8(r)]&pN != 0 + } + return isExcludingLatin(Number, r) +} + +// IsPunct reports whether the rune is a Unicode punctuation character +// (category P). +func IsPunct(r rune) bool { + if uint32(r) <= MaxLatin1 { + return properties[uint8(r)]&pP != 0 + } + return Is(Punct, r) +} + +// IsSpace reports whether the rune is a space character as defined +// by Unicode's White Space property; in the Latin-1 space +// this is +// '\t', '\n', '\v', '\f', '\r', ' ', U+0085 (NEL), U+00A0 (NBSP). +// Other definitions of spacing characters are set by category +// Z and property Pattern_White_Space. +func IsSpace(r rune) bool { + // This property isn't the same as Z; special-case it. + if uint32(r) <= MaxLatin1 { + switch r { + case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0: + return true + } + return false + } + return isExcludingLatin(White_Space, r) +} + +// IsSymbol reports whether the rune is a symbolic character. +func IsSymbol(r rune) bool { + if uint32(r) <= MaxLatin1 { + return properties[uint8(r)]&pS != 0 + } + return isExcludingLatin(Symbol, r) +} |