Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .golangci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ linters:
- exhaustive
- ireturn
- wrapcheck
- varnamelen
settings:
lll:
line-length: 160
Expand Down
53 changes: 44 additions & 9 deletions utfbom.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ func DetectEncoding[T string | []byte](input T) Encoding {
return Unknown
}

if len(ibs) >= 3 && bytes.HasPrefix(ibs, utf8BOM) {
return UTF8
}

if len(ibs) >= 4 {
if bytes.HasPrefix(ibs, utf32BEBOM) {
return UTF32BigEndian
Expand All @@ -81,10 +85,6 @@ func DetectEncoding[T string | []byte](input T) Encoding {
}
}

if len(ibs) >= 3 && bytes.HasPrefix(ibs, utf8BOM) {
return UTF8
}

if bytes.HasPrefix(ibs, utf16BEBOM) {
return UTF16BigEndian
}
Expand Down Expand Up @@ -140,17 +140,52 @@ func (e Encoding) Len() int {
}
}

// Bytes returns the Byte Order Mark (BOM) byte sequence associated with e.
//
// It returns nil for Unknown and for any value that is not one of the
// declared encodings. The returned slice is a fresh copy on every call, so
// callers may modify it freely without affecting the package-level BOM
// sequences that DetectEncoding matches against.
func (e Encoding) Bytes() []byte {
	var bom []byte

	switch e {
	case UTF8:
		bom = utf8BOM
	case UTF16BigEndian:
		bom = utf16BEBOM
	case UTF16LittleEndian:
		bom = utf16LEBOM
	case UTF32BigEndian:
		bom = utf32BEBOM
	case UTF32LittleEndian:
		bom = utf32LEBOM
	default:
		return nil
	}

	// Hand out a copy rather than the shared slice: a caller writing into the
	// result must not be able to change what the package treats as a BOM.
	return append([]byte(nil), bom...)
}

// Trim detects a leading Byte Order Mark (BOM) in input and strips it.
//
// It accepts a string or a []byte and returns a value of the same type
// together with the encoding reported by DetectEncoding. When no BOM is
// detected, input is returned unchanged and the encoding is Unknown.
func Trim[T string | []byte](input T) (T, Encoding) {
	b := []byte(input)
	enc := DetectEncoding(b)

	if enc == Unknown {
		return input, enc
	}

	// enc.Len() is the number of leading bytes to drop for this encoding.
	return T(b[enc.Len():]), enc
}

// Prepend adds the Byte Order Mark (BOM) for enc to the beginning of input,
// which may be a string or a byte slice, and returns a value of the same type.
//
// The input is returned unmodified when:
//   - enc is Unknown,
//   - enc has no BOM bytes (enc.Bytes() is empty), or
//   - DetectEncoding already finds a BOM of any encoding at the start of input.
//
// Otherwise the result is built in a freshly allocated buffer, so it never
// shares memory with the BOM sequence returned by enc.Bytes().
func Prepend[T string | []byte](input T, enc Encoding) T {
	if enc == Unknown {
		return input
	}

	b := []byte(input)

	if DetectEncoding(b) != Unknown {
		return input
	}

	bom := enc.Bytes()
	if len(bom) == 0 {
		return input
	}

	// Build into a new buffer instead of appending onto bom: appending an
	// empty payload to bom would hand the caller the BOM slice itself.
	out := make([]byte, 0, len(bom)+len(b))
	out = append(out, bom...)
	out = append(out, b...)

	return T(out)
}

// Reader implements automatic BOM (Unicode Byte Order Mark) checking and
Expand Down Expand Up @@ -186,7 +221,7 @@ func (r *Reader) Read(buf []byte) (int, error) {
var bomErr error

r.once.Do(func() {
bytes, err := r.rd.Peek(maxBOMLen)
b, err := r.rd.Peek(maxBOMLen)
// do not error out in case underlying payload is too small
// still attempt to read fewer than n bytes.
if err != nil && !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) {
Expand All @@ -195,7 +230,7 @@ func (r *Reader) Read(buf []byte) (int, error) {
return
}

r.Enc = DetectEncoding(bytes)
r.Enc = DetectEncoding(b)
if r.Enc != Unknown {
_, err = r.rd.Discard(r.Enc.Len())
if err != nil {
Expand Down
136 changes: 136 additions & 0 deletions utfbom_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@ import (
"github.com/slash3b/utfbom"
)

// Byte Order Mark sequences, declared locally in the test file and used to
// build inputs and expected values for the tests below.
var (
	// UTF-8 BOM (three bytes).
	utf8BOM = []byte{0xef, 0xbb, 0xbf}
	// UTF-16 big-endian BOM.
	utf16BEBOM = []byte{0xfe, 0xff}
	// UTF-16 little-endian BOM; note it is also the first two bytes of the
	// UTF-32 little-endian BOM.
	utf16LEBOM = []byte{0xff, 0xfe}
	// UTF-32 big-endian BOM.
	utf32BEBOM = []byte{0x00, 0x00, 0xfe, 0xff}
	// UTF-32 little-endian BOM.
	utf32LEBOM = []byte{0xff, 0xfe, 0x00, 0x00}
)

func TestDetectBom(t *testing.T) {
testCases := []struct {
name string
Expand Down Expand Up @@ -127,6 +135,36 @@ func ExampleTrim() {
// output bytes:0x68656c6c6f
}

// ExamplePrepend shows Prepend on a string, on a byte slice, and on input
// that already carries a BOM.
func ExamplePrepend() {
	// A plain string gains the three-byte UTF-8 BOM, which %q prints as the
	// single rune \ufeff.
	greeting := utfbom.Prepend("hello", utfbom.UTF8)
	fmt.Printf("String with UTF-8 BOM: %q\n", greeting)
	fmt.Printf("Bytes: %#x\n\n", greeting)

	// Byte slices work the same way. The payload is the word "world" encoded
	// as UTF-16 Little Endian, so it is given the UTF-16LE BOM.
	world := []byte{0x77, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x6c, 0x00, 0x64, 0x00}
	fmt.Printf("Bytes with UTF-16LE BOM: %#x\n\n", utfbom.Prepend(world, utfbom.UTF16LittleEndian))

	// Input that already starts with a BOM comes back as-is: no second BOM
	// is added.
	prefixed := "\ufeffhello"
	again := utfbom.Prepend(prefixed, utfbom.UTF8)
	fmt.Printf("Idempotent result: %q\n", again)
	fmt.Printf("Bytes are unchanged: %#x\n", again)

	// output:
	// String with UTF-8 BOM: "\ufeffhello"
	// Bytes: 0xefbbbf68656c6c6f
	//
	// Bytes with UTF-16LE BOM: 0xfffe77006f0072006c006400
	//
	// Idempotent result: "\ufeffhello"
	// Bytes are unchanged: 0xefbbbf68656c6c6f
}

func ExampleReader() {
csvFile := "\uFEFFIndex,Customer Id,First Name\n" +
"1,DD37Cf93aecA6Dc,Sheryl"
Expand Down Expand Up @@ -351,3 +389,101 @@ func TestReader_WrappeeReaderHasTinyPayload_OneByteBuffer(t *testing.T) {
be.Err(t, err, io.EOF)
be.Equal(t, 0, n)
}

// TestEncoding_Bytes checks that Bytes yields the expected BOM sequence for
// every declared encoding, and nil for Unknown and for an out-of-range value.
func TestEncoding_Bytes(t *testing.T) {
	t.Parallel()

	cases := []struct {
		name string
		enc  utfbom.Encoding
		want []byte
	}{
		{"Unknown", utfbom.Unknown, nil},
		{"UTF8", utfbom.UTF8, []byte{0xef, 0xbb, 0xbf}},
		{"UTF16BigEndian", utfbom.UTF16BigEndian, []byte{0xfe, 0xff}},
		{"UTF16LittleEndian", utfbom.UTF16LittleEndian, []byte{0xff, 0xfe}},
		{"UTF32BigEndian", utfbom.UTF32BigEndian, []byte{0x00, 0x00, 0xfe, 0xff}},
		{"UTF32LittleEndian", utfbom.UTF32LittleEndian, []byte{0xff, 0xfe, 0x00, 0x00}},
		// A value outside the declared constants must behave like Unknown.
		{"InvalidEncoding", utfbom.Encoding(999), nil},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			be.Equal(t, c.enc.Bytes(), c.want)
		})
	}
}

// TestPrepend exercises Prepend for both type arguments it accepts.
//
// Each table covers every encoding against non-empty and empty input (plus
// nil for byte slices), and confirms that input already starting with a BOM,
// whether the requested one or a different one, is returned unchanged.
func TestPrepend(t *testing.T) {
	t.Parallel()

	t.Run("byte_slice", func(t *testing.T) {
		data := []byte("data")

		testCases := []struct {
			name     string
			input    []byte
			enc      utfbom.Encoding
			expected []byte
		}{
			{"unknown_on_data", data, utfbom.Unknown, data},
			{"unknown_on_empty", []byte{}, utfbom.Unknown, []byte{}},
			{"unknown_on_nil", nil, utfbom.Unknown, nil},
			{"utf8_on_data", data, utfbom.UTF8, append(utf8BOM, data...)},
			{"utf8_on_empty", []byte{}, utfbom.UTF8, utf8BOM},
			{"utf8_on_nil", nil, utfbom.UTF8, utf8BOM},
			{"utf16be_on_data", data, utfbom.UTF16BigEndian, append(utf16BEBOM, data...)},
			{"utf16be_on_empty", []byte{}, utfbom.UTF16BigEndian, utf16BEBOM},
			{"utf16be_on_nil", nil, utfbom.UTF16BigEndian, utf16BEBOM},
			{"utf16le_on_data", data, utfbom.UTF16LittleEndian, append(utf16LEBOM, data...)},
			{"utf16le_on_empty", []byte{}, utfbom.UTF16LittleEndian, utf16LEBOM},
			{"utf16le_on_nil", nil, utfbom.UTF16LittleEndian, utf16LEBOM},
			{"utf32be_on_data", data, utfbom.UTF32BigEndian, append(utf32BEBOM, data...)},
			{"utf32be_on_empty", []byte{}, utfbom.UTF32BigEndian, utf32BEBOM},
			{"utf32be_on_nil", nil, utfbom.UTF32BigEndian, utf32BEBOM},
			{"utf32le_on_data", data, utfbom.UTF32LittleEndian, append(utf32LEBOM, data...)},
			{"utf32le_on_empty", []byte{}, utfbom.UTF32LittleEndian, utf32LEBOM},
			{"utf32le_on_nil", nil, utfbom.UTF32LittleEndian, utf32LEBOM},
			{"idempotent_when_bom_exists", append(utf8BOM, data...), utfbom.UTF8, append(utf8BOM, data...)},
			{"idempotent_when_different_bom_exists", append(utf32LEBOM, data...), utfbom.UTF16BigEndian, append(utf32LEBOM, data...)},
		}

		for _, tc := range testCases {
			t.Run(tc.name, func(t *testing.T) {
				t.Parallel()

				got := utfbom.Prepend(tc.input, tc.enc)
				be.Equal(t, got, tc.expected)
			})
		}
	})

	// Prepend is generic over string and []byte; cover the string
	// instantiation with the same matrix (strings have no nil form).
	t.Run("string", func(t *testing.T) {
		const data = "data"

		testCases := []struct {
			name     string
			input    string
			enc      utfbom.Encoding
			expected string
		}{
			{"unknown_on_data", data, utfbom.Unknown, data},
			{"unknown_on_empty", "", utfbom.Unknown, ""},
			{"utf8_on_data", data, utfbom.UTF8, string(utf8BOM) + data},
			{"utf8_on_empty", "", utfbom.UTF8, string(utf8BOM)},
			{"utf16be_on_data", data, utfbom.UTF16BigEndian, string(utf16BEBOM) + data},
			{"utf16be_on_empty", "", utfbom.UTF16BigEndian, string(utf16BEBOM)},
			{"utf16le_on_data", data, utfbom.UTF16LittleEndian, string(utf16LEBOM) + data},
			{"utf16le_on_empty", "", utfbom.UTF16LittleEndian, string(utf16LEBOM)},
			{"utf32be_on_data", data, utfbom.UTF32BigEndian, string(utf32BEBOM) + data},
			{"utf32be_on_empty", "", utfbom.UTF32BigEndian, string(utf32BEBOM)},
			{"utf32le_on_data", data, utfbom.UTF32LittleEndian, string(utf32LEBOM) + data},
			{"utf32le_on_empty", "", utfbom.UTF32LittleEndian, string(utf32LEBOM)},
			{"idempotent_when_bom_exists", string(utf8BOM) + data, utfbom.UTF8, string(utf8BOM) + data},
			{"idempotent_when_different_bom_exists", string(utf32LEBOM) + data, utfbom.UTF16BigEndian, string(utf32LEBOM) + data},
		}

		for _, tc := range testCases {
			t.Run(tc.name, func(t *testing.T) {
				t.Parallel()

				got := utfbom.Prepend(tc.input, tc.enc)
				be.Equal(t, got, tc.expected)
			})
		}
	})
}