Skip to content

Commit aa83a6f

Browse files
authored
Merge pull request #5 from slash3b/fixes
misc fixes
2 parents ff673bc + a0170f7 commit aa83a6f

File tree

2 files changed

+120
-30
lines changed

2 files changed

+120
-30
lines changed

utfbom.go

Lines changed: 24 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,7 @@ import (
1313
"sync"
1414
)
1515

16-
var (
17-
_ io.Reader = (*Reader)(nil)
18-
utf8BOM = []byte{0xef, 0xbb, 0xbf}
19-
utf16BEBOM = []byte{0xfe, 0xff}
20-
utf16LEBOM = []byte{0xff, 0xfe}
21-
utf32BEBOM = []byte{0x00, 0x00, 0xfe, 0xff}
22-
utf32LEBOM = []byte{0xff, 0xfe, 0x00, 0x00}
23-
)
16+
var _ io.Reader = (*Reader)(nil)
2417

2518
// ErrRead helps to trace error origin.
2619
var ErrRead = errors.New("utfbom: I/O error during BOM processing")
@@ -64,32 +57,32 @@ const (
6457
// - UTF-16 Little Endian (BOM: 0xff 0xfe)
6558
// - UTF-32 Big Endian (BOM: 0x00 0x00 0xfe 0xff)
6659
// - UTF-32 Little Endian (BOM: 0xff 0xfe 0x00 0x00)
67-
func DetectEncoding[T string | []byte](input T) Encoding {
68-
ibs := []byte(input)
60+
func DetectEncoding[T ~string | ~[]byte](input T) Encoding {
61+
b := []byte(input)
6962

70-
if len(ibs) < 2 {
63+
if len(b) < 2 {
7164
return Unknown
7265
}
7366

74-
if len(ibs) >= 3 && bytes.HasPrefix(ibs, utf8BOM) {
67+
if len(b) >= 3 && bytes.HasPrefix(b, []byte{0xef, 0xbb, 0xbf}) {
7568
return UTF8
7669
}
7770

78-
if len(ibs) >= 4 {
79-
if bytes.HasPrefix(ibs, utf32BEBOM) {
71+
if len(b) >= 4 {
72+
if bytes.HasPrefix(b, []byte{0x00, 0x00, 0xfe, 0xff}) {
8073
return UTF32BigEndian
8174
}
8275

83-
if bytes.HasPrefix(ibs, utf32LEBOM) {
76+
if bytes.HasPrefix(b, []byte{0xff, 0xfe, 0x00, 0x00}) {
8477
return UTF32LittleEndian
8578
}
8679
}
8780

88-
if bytes.HasPrefix(ibs, utf16BEBOM) {
81+
if bytes.HasPrefix(b, []byte{0xfe, 0xff}) {
8982
return UTF16BigEndian
9083
}
9184

92-
if bytes.HasPrefix(ibs, utf16LEBOM) {
85+
if bytes.HasPrefix(b, []byte{0xff, 0xfe}) {
9386
return UTF16LittleEndian
9487
}
9588

@@ -108,7 +101,7 @@ func (e Encoding) AnyOf(es ...Encoding) bool {
108101
return false
109102
}
110103

111-
// Strings returns human-readable name of encoding.
104+
// String returns the human-readable name of the encoding.
112105
func (e Encoding) String() string {
113106
switch e {
114107
case UTF8:
@@ -146,21 +139,21 @@ func (e Encoding) Bytes() []byte {
146139
default:
147140
return nil
148141
case UTF8:
149-
return utf8BOM
142+
return []byte{0xef, 0xbb, 0xbf}
150143
case UTF16BigEndian:
151-
return utf16BEBOM
144+
return []byte{0xfe, 0xff}
152145
case UTF16LittleEndian:
153-
return utf16LEBOM
146+
return []byte{0xff, 0xfe}
154147
case UTF32BigEndian:
155-
return utf32BEBOM
148+
return []byte{0x00, 0x00, 0xfe, 0xff}
156149
case UTF32LittleEndian:
157-
return utf32LEBOM
150+
return []byte{0xff, 0xfe, 0x00, 0x00}
158151
}
159152
}
160153

161-
// Trim removes the BOM prefix from the input `s` based on the encoding `enc`.
154+
// Trim removes the BOM prefix from the input.
162155
// Supports any type whose underlying type is string or []byte and returns the same type without the BOM.
163-
func Trim[T string | []byte](input T) (T, Encoding) {
156+
func Trim[T ~string | ~[]byte](input T) (T, Encoding) {
164157
b := []byte(input)
165158
enc := DetectEncoding(b)
166159

@@ -174,7 +167,7 @@ func Trim[T string | []byte](input T) (T, Encoding) {
174167
// Prepend adds the corresponding Byte Order Mark (BOM) for a given encoding
175168
// to the beginning of a string or byte slice.
176169
// If the provided encoding is Unknown, the input is returned unmodified.
177-
func Prepend[T string | []byte](input T, enc Encoding) T {
170+
func Prepend[T ~string | ~[]byte](input T, enc Encoding) T {
178171
if enc == Unknown {
179172
return input
180173
}
@@ -190,6 +183,8 @@ func Prepend[T string | []byte](input T, enc Encoding) T {
190183

191184
// Reader implements automatic BOM (Unicode Byte Order Mark) checking and
192185
// removing as necessary for an io.Reader object.
186+
//
187+
// Reader is not safe for concurrent use.
193188
type Reader struct {
194189
rd *bufio.Reader
195190
once sync.Once
@@ -198,6 +193,7 @@ type Reader struct {
198193
}
199194

200195
// NewReader wraps an incoming reader.
196+
// Passing a nil reader will cause a panic on the first Read call.
201197
func NewReader(rd io.Reader) *Reader {
202198
return &Reader{
203199
rd: bufio.NewReader(rd),
@@ -207,10 +203,8 @@ func NewReader(rd io.Reader) *Reader {
207203
}
208204

209205
// Read implements the io.Reader interface.
210-
// On the first read call, it reads from the underlying Reader, detects and removes any Byte Order Mark (BOM).
211-
// Subsequent calls delegate directly to the underlying Reader without BOM handling.
212-
// Read is only safe for concurrent use during the first call due to sync.Once; after that, thread-safety
213-
// depends on the underlying Reader. It is best to assume unsafe concurrent use.
206+
// On the first call, it detects and removes any Byte Order Mark (BOM).
207+
// Subsequent calls delegate directly to the underlying Reader.
214208
func (r *Reader) Read(buf []byte) (int, error) {
215209
const maxBOMLen = 4
216210

utfbom_test.go

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,34 @@ func TestEncoding_Bytes(t *testing.T) {
443443
}
444444
}
445445

446+
// TestBytes_NoAliasing checks that mutating the slice returned by Bytes() does not affect the result of later Bytes() calls.
447+
func TestBytes_NoAliasing(t *testing.T) {
448+
t.Parallel()
449+
450+
encodings := []utfbom.Encoding{
451+
utfbom.UTF8,
452+
utfbom.UTF16BigEndian,
453+
utfbom.UTF16LittleEndian,
454+
utfbom.UTF32BigEndian,
455+
utfbom.UTF32LittleEndian,
456+
}
457+
458+
for _, enc := range encodings {
459+
t.Run(enc.String(), func(t *testing.T) {
460+
t.Parallel()
461+
462+
original := enc.Bytes()
463+
originalCopy := make([]byte, len(original))
464+
copy(originalCopy, original)
465+
466+
original[0] = 0x00
467+
468+
fresh := enc.Bytes()
469+
be.Equal(t, fresh, originalCopy)
470+
})
471+
}
472+
}
473+
446474
func TestPrepend(t *testing.T) {
447475
t.Parallel()
448476

@@ -487,3 +515,71 @@ func TestPrepend(t *testing.T) {
487515
}
488516
})
489517
}
518+
519+
type CustomString string
520+
521+
type CustomBytes []byte
522+
523+
func TestDetectEncoding_TypeAliases(t *testing.T) {
524+
t.Parallel()
525+
526+
t.Run("custom_string", func(t *testing.T) {
527+
input := CustomString("\ufeffhello")
528+
enc := utfbom.DetectEncoding(input)
529+
be.Equal(t, enc, utfbom.UTF8)
530+
})
531+
532+
t.Run("custom_bytes", func(t *testing.T) {
533+
input := CustomBytes([]byte{0xfe, 0xff, 'h', 'i'})
534+
enc := utfbom.DetectEncoding(input)
535+
be.Equal(t, enc, utfbom.UTF16BigEndian)
536+
})
537+
}
538+
539+
func TestTrim_TypeAliases(t *testing.T) {
540+
t.Parallel()
541+
542+
t.Run("custom_string", func(t *testing.T) {
543+
input := CustomString("\ufeffhello")
544+
out, enc := utfbom.Trim(input)
545+
be.Equal(t, enc, utfbom.UTF8)
546+
be.Equal(t, out, CustomString("hello"))
547+
})
548+
549+
t.Run("custom_bytes", func(t *testing.T) {
550+
input := CustomBytes([]byte{0xfe, 0xff, 'h', 'i'})
551+
out, enc := utfbom.Trim(input)
552+
be.Equal(t, enc, utfbom.UTF16BigEndian)
553+
be.Equal(t, out, CustomBytes([]byte{'h', 'i'}))
554+
})
555+
}
556+
557+
func TestPrepend_TypeAliases(t *testing.T) {
558+
t.Parallel()
559+
560+
t.Run("custom_string", func(t *testing.T) {
561+
input := CustomString("hello")
562+
out := utfbom.Prepend(input, utfbom.UTF8)
563+
be.Equal(t, out, CustomString("\ufeffhello"))
564+
})
565+
566+
t.Run("custom_bytes", func(t *testing.T) {
567+
input := CustomBytes([]byte{'h', 'i'})
568+
out := utfbom.Prepend(input, utfbom.UTF16BigEndian)
569+
be.Equal(t, out, CustomBytes([]byte{0xfe, 0xff, 'h', 'i'}))
570+
})
571+
}
572+
573+
func TestNewReader_NilPanics(t *testing.T) {
574+
t.Parallel()
575+
576+
rd := utfbom.NewReader(nil)
577+
578+
defer func() {
579+
r := recover()
580+
be.True(t, r != nil)
581+
}()
582+
583+
buf := make([]byte, 10)
584+
_, _ = rd.Read(buf)
585+
}

0 commit comments

Comments
 (0)