195 lines
5.5 KiB
Go
195 lines
5.5 KiB
Go
// Copyright 2024 Garrett D'Amore
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the license at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package encoding
|
|
|
|
import (
|
|
"sync"
|
|
"unicode/utf8"
|
|
|
|
"golang.org/x/text/encoding"
|
|
"golang.org/x/text/transform"
|
|
)
|
|
|
|
const (
	// RuneError is the Unicode replacement character, U+FFFD.  It has the
	// same value as utf8.RuneError and marks bytes or runes that have no
	// valid mapping.
	RuneError = '\uFFFD'

	// RuneSelf is the first value that is not plain ASCII: every rune
	// below it is encoded in UTF-8 as a single byte with the same numeric
	// value.  It's also the upper limit for ASCII.
	RuneSelf = 0x80

	// ASCIISub is the ASCII substitute (SUB) control character, used in
	// place of characters that cannot be represented.
	ASCIISub = '\x1a'
)
|
|
|
|
// Charmap is a structure for setting up encodings for 8-bit character sets,
|
|
// for transforming between UTF8 and that other character set. It has some
|
|
// ideas borrowed from golang.org/x/text/encoding/charmap, but it uses a
|
|
// different implementation. This implementation uses maps, and supports
|
|
// user-defined maps.
|
|
//
|
|
// We do assume that a character map has a reasonable substitution character,
|
|
// and that valid encodings are stable (exactly a 1:1 map) and stateless
|
|
// (that is there is no shift character or anything like that.) Hence this
|
|
// approach will not work for many East Asian character sets.
|
|
//
|
|
// Measurement shows little or no measurable difference in the performance of
|
|
// the two approaches. The difference was down to a couple of nsec/op, and
|
|
// no consistent pattern as to which ran faster. With the conversion to
|
|
// UTF-8 the code takes about 25 nsec/op. The conversion in the reverse
|
|
// direction takes about 100 nsec/op. (The larger cost for conversion
|
|
// from UTF-8 is most likely due to the need to convert the UTF-8 byte stream
|
|
// to a rune before conversion.
|
|
type Charmap struct {
|
|
transform.NopResetter
|
|
bytes map[rune]byte
|
|
runes [256][]byte
|
|
once sync.Once
|
|
|
|
// The map between bytes and runes. To indicate that a specific
|
|
// byte value is invalid for a charcter set, use the rune
|
|
// utf8.RuneError. Values that are absent from this map will
|
|
// be assumed to have the identity mapping -- that is the default
|
|
// is to assume ISO8859-1, where all 8-bit characters have the same
|
|
// numeric value as their Unicode runes. (Not to be confused with
|
|
// the UTF-8 values, which *will* be different for non-ASCII runes.)
|
|
//
|
|
// If no values less than RuneSelf are changed (or have non-identity
|
|
// mappings), then the character set is assumed to be an ASCII
|
|
// superset, and certain assumptions and optimizations become
|
|
// available for ASCII bytes.
|
|
Map map[byte]rune
|
|
|
|
// The ReplacementChar is the byte value to use for substitution.
|
|
// It should normally be ASCIISub for ASCII encodings. This may be
|
|
// unset (left to zero) for mappings that are strictly ASCII supersets.
|
|
// In that case ASCIISub will be assumed instead.
|
|
ReplacementChar byte
|
|
}
|
|
|
|
type cmapDecoder struct {
|
|
transform.NopResetter
|
|
runes [256][]byte
|
|
}
|
|
|
|
type cmapEncoder struct {
|
|
transform.NopResetter
|
|
bytes map[rune]byte
|
|
replace byte
|
|
}
|
|
|
|
// Init initializes internal values of a character map. This should
|
|
// be done early, to minimize the cost of allocation of transforms
|
|
// later. It is not strictly necessary however, as the allocation
|
|
// functions will arrange to call it if it has not already been done.
|
|
func (c *Charmap) Init() {
|
|
c.once.Do(c.initialize)
|
|
}
|
|
|
|
func (c *Charmap) initialize() {
|
|
c.bytes = make(map[rune]byte)
|
|
ascii := true
|
|
|
|
for i := 0; i < 256; i++ {
|
|
r, ok := c.Map[byte(i)]
|
|
if !ok {
|
|
r = rune(i)
|
|
}
|
|
if r < 128 && r != rune(i) {
|
|
ascii = false
|
|
}
|
|
if r != RuneError {
|
|
c.bytes[r] = byte(i)
|
|
}
|
|
utf := make([]byte, utf8.RuneLen(r))
|
|
utf8.EncodeRune(utf, r)
|
|
c.runes[i] = utf
|
|
}
|
|
if ascii && c.ReplacementChar == '\x00' {
|
|
c.ReplacementChar = ASCIISub
|
|
}
|
|
}
|
|
|
|
// NewDecoder returns a Decoder the converts from the 8-bit
|
|
// character set to UTF-8. Unknown mappings, if any, are mapped
|
|
// to '\uFFFD'.
|
|
func (c *Charmap) NewDecoder() *encoding.Decoder {
|
|
c.Init()
|
|
return &encoding.Decoder{Transformer: &cmapDecoder{runes: c.runes}}
|
|
}
|
|
|
|
// NewEncoder returns a Transformer that converts from UTF8 to the
|
|
// 8-bit character set. Unknown mappings are mapped to 0x1A.
|
|
func (c *Charmap) NewEncoder() *encoding.Encoder {
|
|
c.Init()
|
|
return &encoding.Encoder{
|
|
Transformer: &cmapEncoder{
|
|
bytes: c.bytes,
|
|
replace: c.ReplacementChar,
|
|
},
|
|
}
|
|
}
|
|
|
|
func (d *cmapDecoder) Transform(dst, src []byte, atEOF bool) (int, int, error) {
|
|
var e error
|
|
var ndst, nsrc int
|
|
|
|
for _, c := range src {
|
|
b := d.runes[c]
|
|
l := len(b)
|
|
|
|
if ndst+l > len(dst) {
|
|
e = transform.ErrShortDst
|
|
break
|
|
}
|
|
for i := 0; i < l; i++ {
|
|
dst[ndst] = b[i]
|
|
ndst++
|
|
}
|
|
nsrc++
|
|
}
|
|
return ndst, nsrc, e
|
|
}
|
|
|
|
func (d *cmapEncoder) Transform(dst, src []byte, atEOF bool) (int, int, error) {
|
|
var e error
|
|
var ndst, nsrc int
|
|
for nsrc < len(src) {
|
|
if ndst >= len(dst) {
|
|
e = transform.ErrShortDst
|
|
break
|
|
}
|
|
|
|
r, sz := utf8.DecodeRune(src[nsrc:])
|
|
if r == utf8.RuneError && sz == 1 {
|
|
// If its inconclusive due to insufficient data in
|
|
// in the source, report it
|
|
if atEOF && !utf8.FullRune(src[nsrc:]) {
|
|
e = transform.ErrShortSrc
|
|
break
|
|
}
|
|
}
|
|
|
|
if c, ok := d.bytes[r]; ok {
|
|
dst[ndst] = c
|
|
} else {
|
|
dst[ndst] = d.replace
|
|
}
|
|
nsrc += sz
|
|
ndst++
|
|
}
|
|
|
|
return ndst, nsrc, e
|
|
}
|