Skip to content

Commit 6af523f

Browse files
authored
Merge pull request #3 from slash3b/prepend
prepend function and tiny perf improvement
2 parents 12dbb97 + 3e79b6c commit 6af523f

File tree

3 files changed

+181
-9
lines changed

3 files changed

+181
-9
lines changed

.golangci.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ linters:
1111
- exhaustive
1212
- ireturn
1313
- wrapcheck
14+
- varnamelen
1415
settings:
1516
lll:
1617
line-length: 160

utfbom.go

Lines changed: 44 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,10 @@ func DetectEncoding[T string | []byte](input T) Encoding {
7171
return Unknown
7272
}
7373

74+
if len(ibs) >= 3 && bytes.HasPrefix(ibs, utf8BOM) {
75+
return UTF8
76+
}
77+
7478
if len(ibs) >= 4 {
7579
if bytes.HasPrefix(ibs, utf32BEBOM) {
7680
return UTF32BigEndian
@@ -81,10 +85,6 @@ func DetectEncoding[T string | []byte](input T) Encoding {
8185
}
8286
}
8387

84-
if len(ibs) >= 3 && bytes.HasPrefix(ibs, utf8BOM) {
85-
return UTF8
86-
}
87-
8888
if bytes.HasPrefix(ibs, utf16BEBOM) {
8989
return UTF16BigEndian
9090
}
@@ -140,17 +140,52 @@ func (e Encoding) Len() int {
140140
}
141141
}
142142

143+
// Bytes returns encoding bytes.
144+
func (e Encoding) Bytes() []byte {
145+
switch e {
146+
default:
147+
return nil
148+
case UTF8:
149+
return utf8BOM
150+
case UTF16BigEndian:
151+
return utf16BEBOM
152+
case UTF16LittleEndian:
153+
return utf16LEBOM
154+
case UTF32BigEndian:
155+
return utf32BEBOM
156+
case UTF32LittleEndian:
157+
return utf32LEBOM
158+
}
159+
}
160+
143161
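// Trim detects the BOM at the start of `input` and strips it, returning the remainder and the detected Encoding.
// If no BOM is found, `input` is returned unchanged together with Unknown.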
144162
// Supports string or []byte inputs and returns the same type without the BOM.
145163
func Trim[T string | []byte](input T) (T, Encoding) {
146-
bytes := []byte(input)
147-
enc := DetectEncoding(bytes)
164+
b := []byte(input)
165+
enc := DetectEncoding(b)
148166

149167
if enc == Unknown {
150168
return input, enc
151169
}
152170

153-
return T(bytes[enc.Len():]), enc
171+
return T(b[enc.Len():]), enc
172+
}
173+
174+
// Prepend adds the corresponding Byte Order Mark (BOM) for a given encoding
175+
// to the beginning of a string or byte slice.
176+
// If the provided encoding is Unknown, the input is returned unmodified.
177+
func Prepend[T string | []byte](input T, enc Encoding) T {
178+
if enc == Unknown {
179+
return input
180+
}
181+
182+
b := []byte(input)
183+
184+
if DetectEncoding(b) != Unknown {
185+
return input
186+
}
187+
188+
return T(append(enc.Bytes(), b...))
154189
}
155190

156191
// Reader implements automatic BOM (Unicode Byte Order Mark) checking and
@@ -186,7 +221,7 @@ func (r *Reader) Read(buf []byte) (int, error) {
186221
var bomErr error
187222

188223
r.once.Do(func() {
189-
bytes, err := r.rd.Peek(maxBOMLen)
224+
b, err := r.rd.Peek(maxBOMLen)
190225
// do not error out in case underlying payload is too small
191226
// still attempt to read fewer than n bytes.
192227
if err != nil && !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) {
@@ -195,7 +230,7 @@ func (r *Reader) Read(buf []byte) (int, error) {
195230
return
196231
}
197232

198-
r.Enc = DetectEncoding(bytes)
233+
r.Enc = DetectEncoding(b)
199234
if r.Enc != Unknown {
200235
_, err = r.rd.Discard(r.Enc.Len())
201236
if err != nil {

utfbom_test.go

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,14 @@ import (
1414
"github.com/slash3b/utfbom"
1515
)
1616

17+
var (
18+
utf8BOM = []byte{0xef, 0xbb, 0xbf}
19+
utf16BEBOM = []byte{0xfe, 0xff}
20+
utf16LEBOM = []byte{0xff, 0xfe}
21+
utf32BEBOM = []byte{0x00, 0x00, 0xfe, 0xff}
22+
utf32LEBOM = []byte{0xff, 0xfe, 0x00, 0x00}
23+
)
24+
1725
func TestDetectBom(t *testing.T) {
1826
testCases := []struct {
1927
name string
@@ -127,6 +135,36 @@ func ExampleTrim() {
127135
// output bytes:0x68656c6c6f
128136
}
129137

138+
func ExamplePrepend() {
139+
// Prepend a UTF-8 BOM to a simple string.
140+
// The UTF-8 BOM is represented by the rune \ufeff.
141+
withBOM := utfbom.Prepend("hello", utfbom.UTF8)
142+
fmt.Printf("String with UTF-8 BOM: %q\n", withBOM)
143+
fmt.Printf("Bytes: %#x\n\n", withBOM)
144+
145+
// Prepend a UTF-16LE BOM to a byte slice that is also UTF-16LE encoded.
146+
// This represents the word "world" in UTF-16 Little Endian.
147+
data := []byte{0x77, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x6c, 0x00, 0x64, 0x00}
148+
withBOMBytes := utfbom.Prepend(data, utfbom.UTF16LittleEndian)
149+
fmt.Printf("Bytes with UTF-16LE BOM: %#x\n\n", withBOMBytes)
150+
151+
// The Prepend function is idempotent.
152+
// If a BOM already exists, it will not add another one.
153+
alreadyHasBOM := "\ufeffhello"
154+
idempotentResult := utfbom.Prepend(alreadyHasBOM, utfbom.UTF8)
155+
fmt.Printf("Idempotent result: %q\n", idempotentResult)
156+
fmt.Printf("Bytes are unchanged: %#x\n", idempotentResult)
157+
158+
// output:
159+
// String with UTF-8 BOM: "\ufeffhello"
160+
// Bytes: 0xefbbbf68656c6c6f
161+
//
162+
// Bytes with UTF-16LE BOM: 0xfffe77006f0072006c006400
163+
//
164+
// Idempotent result: "\ufeffhello"
165+
// Bytes are unchanged: 0xefbbbf68656c6c6f
166+
}
167+
130168
func ExampleReader() {
131169
csvFile := "\uFEFFIndex,Customer Id,First Name\n" +
132170
"1,DD37Cf93aecA6Dc,Sheryl"
@@ -351,3 +389,101 @@ func TestReader_WrappeeReaderHasTinyPayload_OneByteBuffer(t *testing.T) {
351389
be.Err(t, err, io.EOF)
352390
be.Equal(t, 0, n)
353391
}
392+
393+
func TestEncoding_Bytes(t *testing.T) {
394+
t.Parallel()
395+
396+
testCases := []struct {
397+
name string
398+
enc utfbom.Encoding
399+
expected []byte
400+
}{
401+
{
402+
name: "Unknown",
403+
enc: utfbom.Unknown,
404+
expected: nil,
405+
},
406+
{
407+
name: "UTF8",
408+
enc: utfbom.UTF8,
409+
expected: []byte{0xef, 0xbb, 0xbf},
410+
},
411+
{
412+
name: "UTF16BigEndian",
413+
enc: utfbom.UTF16BigEndian,
414+
expected: []byte{0xfe, 0xff},
415+
},
416+
{
417+
name: "UTF16LittleEndian",
418+
enc: utfbom.UTF16LittleEndian,
419+
expected: []byte{0xff, 0xfe},
420+
},
421+
{
422+
name: "UTF32BigEndian",
423+
enc: utfbom.UTF32BigEndian,
424+
expected: []byte{0x00, 0x00, 0xfe, 0xff},
425+
},
426+
{
427+
name: "UTF32LittleEndian",
428+
enc: utfbom.UTF32LittleEndian,
429+
expected: []byte{0xff, 0xfe, 0x00, 0x00},
430+
},
431+
{
432+
name: "InvalidEncoding",
433+
enc: utfbom.Encoding(999),
434+
expected: nil,
435+
},
436+
}
437+
438+
for _, tc := range testCases {
439+
t.Run(tc.name, func(t *testing.T) {
440+
got := tc.enc.Bytes()
441+
be.Equal(t, got, tc.expected)
442+
})
443+
}
444+
}
445+
446+
func TestPrepend(t *testing.T) {
447+
t.Parallel()
448+
449+
t.Run("byte_slice", func(t *testing.T) {
450+
data := []byte("data")
451+
452+
testCases := []struct {
453+
name string
454+
input []byte
455+
enc utfbom.Encoding
456+
expected []byte
457+
}{
458+
{"unknown_on_data", data, utfbom.Unknown, data},
459+
{"unknown_on_empty", []byte{}, utfbom.Unknown, []byte{}},
460+
{"unknown_on_nil", nil, utfbom.Unknown, nil},
461+
{"utf8_on_data", data, utfbom.UTF8, append(utf8BOM, data...)},
462+
{"utf8_on_empty", []byte{}, utfbom.UTF8, utf8BOM},
463+
{"utf8_on_nil", nil, utfbom.UTF8, utf8BOM},
464+
{"utf16be_on_data", data, utfbom.UTF16BigEndian, append(utf16BEBOM, data...)},
465+
{"utf16be_on_empty", []byte{}, utfbom.UTF16BigEndian, utf16BEBOM},
466+
{"utf16be_on_nil", nil, utfbom.UTF16BigEndian, utf16BEBOM},
467+
{"utf16le_on_data", data, utfbom.UTF16LittleEndian, append(utf16LEBOM, data...)},
468+
{"utf16le_on_empty", []byte{}, utfbom.UTF16LittleEndian, utf16LEBOM},
469+
{"utf16le_on_nil", nil, utfbom.UTF16LittleEndian, utf16LEBOM},
470+
{"utf32be_on_data", data, utfbom.UTF32BigEndian, append(utf32BEBOM, data...)},
471+
{"utf32be_on_empty", []byte{}, utfbom.UTF32BigEndian, utf32BEBOM},
472+
{"utf32be_on_nil", nil, utfbom.UTF32BigEndian, utf32BEBOM},
473+
{"utf32le_on_data", data, utfbom.UTF32LittleEndian, append(utf32LEBOM, data...)},
474+
{"utf32le_on_empty", []byte{}, utfbom.UTF32LittleEndian, utf32LEBOM},
475+
{"utf32le_on_nil", nil, utfbom.UTF32LittleEndian, utf32LEBOM},
476+
{"idempotent_when_bom_exists", append(utf8BOM, data...), utfbom.UTF8, append(utf8BOM, data...)},
477+
{"idempotent_when_different_bom_exists", append(utf32LEBOM, data...), utfbom.UTF16BigEndian, append(utf32LEBOM, data...)},
478+
}
479+
480+
for _, tc := range testCases {
481+
t.Run(tc.name, func(t *testing.T) {
482+
t.Parallel()
483+
484+
got := utfbom.Prepend(tc.input, tc.enc)
485+
be.Equal(t, got, tc.expected)
486+
})
487+
}
488+
})
489+
}

0 commit comments

Comments
 (0)