diff --git a/.golangci.yml b/.golangci.yml index 0e7453f..524d13f 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -11,6 +11,7 @@ linters: - exhaustive - ireturn - wrapcheck + - varnamelen settings: lll: line-length: 160 diff --git a/utfbom.go b/utfbom.go index 421c90e..c5ab8d6 100644 --- a/utfbom.go +++ b/utfbom.go @@ -71,6 +71,10 @@ func DetectEncoding[T string | []byte](input T) Encoding { return Unknown } + if len(ibs) >= 3 && bytes.HasPrefix(ibs, utf8BOM) { + return UTF8 + } + if len(ibs) >= 4 { if bytes.HasPrefix(ibs, utf32BEBOM) { return UTF32BigEndian @@ -81,10 +85,6 @@ func DetectEncoding[T string | []byte](input T) Encoding { } } - if len(ibs) >= 3 && bytes.HasPrefix(ibs, utf8BOM) { - return UTF8 - } - if bytes.HasPrefix(ibs, utf16BEBOM) { return UTF16BigEndian } @@ -140,17 +140,52 @@ func (e Encoding) Len() int { } } +// Bytes returns the BOM byte sequence for the encoding, or nil if it has none; the slice is shared package state and must not be modified. +func (e Encoding) Bytes() []byte { + switch e { + default: + return nil + case UTF8: + return utf8BOM + case UTF16BigEndian: + return utf16BEBOM + case UTF16LittleEndian: + return utf16LEBOM + case UTF32BigEndian: + return utf32BEBOM + case UTF32LittleEndian: + return utf32LEBOM + } +} + // Trim removes the BOM prefix from the input `s` based on the encoding `enc`. // Supports string or []byte inputs and returns the same type without the BOM. func Trim[T string | []byte](input T) (T, Encoding) { - bytes := []byte(input) - enc := DetectEncoding(bytes) + b := []byte(input) + enc := DetectEncoding(b) if enc == Unknown { return input, enc } - return T(bytes[enc.Len():]), enc + return T(b[enc.Len():]), enc +} + +// Prepend adds the corresponding Byte Order Mark (BOM) for a given encoding +// to the beginning of a string or byte slice. +// If the provided encoding is Unknown, or the input already starts with a BOM, the input is returned unmodified. 
+func Prepend[T string | []byte](input T, enc Encoding) T { + if enc == Unknown { + return input + } + + b := []byte(input) + + if DetectEncoding(b) != Unknown { + return input + } + + return T(append(append([]byte(nil), enc.Bytes()...), b...)) // copy the BOM first so an empty input never returns the package-level BOM slice } // Reader implements automatic BOM (Unicode Byte Order Mark) checking and @@ -186,7 +221,7 @@ func (r *Reader) Read(buf []byte) (int, error) { var bomErr error r.once.Do(func() { - bytes, err := r.rd.Peek(maxBOMLen) + b, err := r.rd.Peek(maxBOMLen) // do not error out in case underlying payload is too small // still attempt to read fewer than n bytes. if err != nil && !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) { @@ -195,7 +230,7 @@ func (r *Reader) Read(buf []byte) (int, error) { return } - r.Enc = DetectEncoding(bytes) + r.Enc = DetectEncoding(b) if r.Enc != Unknown { _, err = r.rd.Discard(r.Enc.Len()) if err != nil { diff --git a/utfbom_test.go b/utfbom_test.go index bda6700..012a6e8 100644 --- a/utfbom_test.go +++ b/utfbom_test.go @@ -14,6 +14,14 @@ import ( "github.com/slash3b/utfbom" ) +var ( + utf8BOM = []byte{0xef, 0xbb, 0xbf} + utf16BEBOM = []byte{0xfe, 0xff} + utf16LEBOM = []byte{0xff, 0xfe} + utf32BEBOM = []byte{0x00, 0x00, 0xfe, 0xff} + utf32LEBOM = []byte{0xff, 0xfe, 0x00, 0x00} +) + func TestDetectBom(t *testing.T) { testCases := []struct { name string @@ -127,6 +135,36 @@ func ExampleTrim() { // output bytes:0x68656c6c6f } +func ExamplePrepend() { + // Prepend a UTF-8 BOM to a simple string. + // The UTF-8 BOM is represented by the rune \ufeff. + withBOM := utfbom.Prepend("hello", utfbom.UTF8) + fmt.Printf("String with UTF-8 BOM: %q\n", withBOM) + fmt.Printf("Bytes: %#x\n\n", withBOM) + + // Prepend a UTF-16LE BOM to a byte slice that is also UTF-16LE encoded. + // This represents the word "world" in UTF-16 Little Endian. 
+ data := []byte{0x77, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x6c, 0x00, 0x64, 0x00} + withBOMBytes := utfbom.Prepend(data, utfbom.UTF16LittleEndian) + fmt.Printf("Bytes with UTF-16LE BOM: %#x\n\n", withBOMBytes) + + // The Prepend function is idempotent. + // If a BOM already exists, it will not add another one. + alreadyHasBOM := "\ufeffhello" + idempotentResult := utfbom.Prepend(alreadyHasBOM, utfbom.UTF8) + fmt.Printf("Idempotent result: %q\n", idempotentResult) + fmt.Printf("Bytes are unchanged: %#x\n", idempotentResult) + + // output: + // String with UTF-8 BOM: "\ufeffhello" + // Bytes: 0xefbbbf68656c6c6f + // + // Bytes with UTF-16LE BOM: 0xfffe77006f0072006c006400 + // + // Idempotent result: "\ufeffhello" + // Bytes are unchanged: 0xefbbbf68656c6c6f +} + func ExampleReader() { csvFile := "\uFEFFIndex,Customer Id,First Name\n" + "1,DD37Cf93aecA6Dc,Sheryl" @@ -351,3 +389,101 @@ func TestReader_WrappeeReaderHasTinyPayload_OneByteBuffer(t *testing.T) { be.Err(t, err, io.EOF) be.Equal(t, 0, n) } + +func TestEncoding_Bytes(t *testing.T) { + t.Parallel() + + testCases := []struct { + name string + enc utfbom.Encoding + expected []byte + }{ + { + name: "Unknown", + enc: utfbom.Unknown, + expected: nil, + }, + { + name: "UTF8", + enc: utfbom.UTF8, + expected: []byte{0xef, 0xbb, 0xbf}, + }, + { + name: "UTF16BigEndian", + enc: utfbom.UTF16BigEndian, + expected: []byte{0xfe, 0xff}, + }, + { + name: "UTF16LittleEndian", + enc: utfbom.UTF16LittleEndian, + expected: []byte{0xff, 0xfe}, + }, + { + name: "UTF32BigEndian", + enc: utfbom.UTF32BigEndian, + expected: []byte{0x00, 0x00, 0xfe, 0xff}, + }, + { + name: "UTF32LittleEndian", + enc: utfbom.UTF32LittleEndian, + expected: []byte{0xff, 0xfe, 0x00, 0x00}, + }, + { + name: "InvalidEncoding", + enc: utfbom.Encoding(999), + expected: nil, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got := tc.enc.Bytes() + be.Equal(t, got, tc.expected) + }) + } +} + +func TestPrepend(t *testing.T) { + 
t.Parallel() + + t.Run("byte_slice", func(t *testing.T) { + data := []byte("data") + + testCases := []struct { + name string + input []byte + enc utfbom.Encoding + expected []byte + }{ + {"unknown_on_data", data, utfbom.Unknown, data}, + {"unknown_on_empty", []byte{}, utfbom.Unknown, []byte{}}, + {"unknown_on_nil", nil, utfbom.Unknown, nil}, + {"utf8_on_data", data, utfbom.UTF8, append(utf8BOM, data...)}, + {"utf8_on_empty", []byte{}, utfbom.UTF8, utf8BOM}, + {"utf8_on_nil", nil, utfbom.UTF8, utf8BOM}, + {"utf16be_on_data", data, utfbom.UTF16BigEndian, append(utf16BEBOM, data...)}, + {"utf16be_on_empty", []byte{}, utfbom.UTF16BigEndian, utf16BEBOM}, + {"utf16be_on_nil", nil, utfbom.UTF16BigEndian, utf16BEBOM}, + {"utf16le_on_data", data, utfbom.UTF16LittleEndian, append(utf16LEBOM, data...)}, + {"utf16le_on_empty", []byte{}, utfbom.UTF16LittleEndian, utf16LEBOM}, + {"utf16le_on_nil", nil, utfbom.UTF16LittleEndian, utf16LEBOM}, + {"utf32be_on_data", data, utfbom.UTF32BigEndian, append(utf32BEBOM, data...)}, + {"utf32be_on_empty", []byte{}, utfbom.UTF32BigEndian, utf32BEBOM}, + {"utf32be_on_nil", nil, utfbom.UTF32BigEndian, utf32BEBOM}, + {"utf32le_on_data", data, utfbom.UTF32LittleEndian, append(utf32LEBOM, data...)}, + {"utf32le_on_empty", []byte{}, utfbom.UTF32LittleEndian, utf32LEBOM}, + {"utf32le_on_nil", nil, utfbom.UTF32LittleEndian, utf32LEBOM}, + {"idempotent_when_bom_exists", append(utf8BOM, data...), utfbom.UTF8, append(utf8BOM, data...)}, + {"idempotent_when_different_bom_exists", append(utf32LEBOM, data...), utfbom.UTF16BigEndian, append(utf32LEBOM, data...)}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + got := utfbom.Prepend(tc.input, tc.enc) + be.Equal(t, got, tc.expected) + }) + } + }) +}