Skip to content

Commit 12dbb97

Browse files
authored
Merge pull request #2 from slash3b/improv
fix: better support for reading tiny payloads
2 parents 6d8f172 + 9ad22f8 commit 12dbb97

File tree

2 files changed

+51
-33
lines changed

2 files changed

+51
-33
lines changed

utfbom.go

Lines changed: 19 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -7,22 +7,23 @@ package utfbom
77

88
import (
99
"bufio"
10+
"bytes"
1011
"errors"
1112
"io"
1213
"sync"
1314
)
1415

1516
var (
1617
_ io.Reader = (*Reader)(nil)
17-
utf8BOM = [3]byte{0xef, 0xbb, 0xbf}
18-
utf16BEBOM = [2]byte{0xfe, 0xff}
19-
utf16LEBOM = [2]byte{0xff, 0xfe}
20-
utf32BEBOM = [4]byte{0x00, 0x00, 0xfe, 0xff}
21-
utf32LEBOM = [4]byte{0xff, 0xfe, 0x00, 0x00}
18+
utf8BOM = []byte{0xef, 0xbb, 0xbf}
19+
utf16BEBOM = []byte{0xfe, 0xff}
20+
utf16LEBOM = []byte{0xff, 0xfe}
21+
utf32BEBOM = []byte{0x00, 0x00, 0xfe, 0xff}
22+
utf32LEBOM = []byte{0xff, 0xfe, 0x00, 0x00}
2223
)
2324

2425
// ErrRead helps to trace error origin.
25-
var ErrRead = errors.New("utfbom library unable to detect BOM")
26+
var ErrRead = errors.New("utfbom: I/O error during BOM processing")
2627

2728
// Encoding is a character encoding standard.
2829
type Encoding int
@@ -64,39 +65,31 @@ const (
6465
// - UTF-32 Big Endian (BOM: 0x00 0x00 0xfe 0xff)
6566
// - UTF-32 Little Endian (BOM: 0xff 0xfe 0x00 0x00)
6667
func DetectEncoding[T string | []byte](input T) Encoding {
67-
bytes := []byte(input)
68+
ibs := []byte(input)
6869

69-
if len(bytes) < 2 {
70+
if len(ibs) < 2 {
7071
return Unknown
7172
}
7273

73-
if len(bytes) >= 4 {
74-
if utf32BEBOM[0] == bytes[0] &&
75-
utf32BEBOM[1] == bytes[1] &&
76-
utf32BEBOM[2] == bytes[2] &&
77-
utf32BEBOM[3] == bytes[3] {
74+
if len(ibs) >= 4 {
75+
if bytes.HasPrefix(ibs, utf32BEBOM) {
7876
return UTF32BigEndian
7977
}
8078

81-
if utf32LEBOM[0] == bytes[0] &&
82-
utf32LEBOM[1] == bytes[1] &&
83-
utf32LEBOM[2] == bytes[2] &&
84-
utf32LEBOM[3] == bytes[3] {
79+
if bytes.HasPrefix(ibs, utf32LEBOM) {
8580
return UTF32LittleEndian
8681
}
8782
}
8883

89-
if len(bytes) >= 3 {
90-
if utf8BOM[0] == bytes[0] && utf8BOM[1] == bytes[1] && utf8BOM[2] == bytes[2] {
91-
return UTF8
92-
}
84+
if len(ibs) >= 3 && bytes.HasPrefix(ibs, utf8BOM) {
85+
return UTF8
9386
}
9487

95-
if utf16BEBOM[0] == bytes[0] && utf16BEBOM[1] == bytes[1] {
88+
if bytes.HasPrefix(ibs, utf16BEBOM) {
9689
return UTF16BigEndian
9790
}
9891

99-
if utf16LEBOM[0] == bytes[0] && utf16LEBOM[1] == bytes[1] {
92+
if bytes.HasPrefix(ibs, utf16LEBOM) {
10093
return UTF16LittleEndian
10194
}
10295

@@ -194,7 +187,9 @@ func (r *Reader) Read(buf []byte) (int, error) {
194187

195188
r.once.Do(func() {
196189
bytes, err := r.rd.Peek(maxBOMLen)
197-
if err != nil {
190+
// Do not fail when the underlying payload is shorter than maxBOMLen:
191+
// Peek still returns the bytes it could read, so continue with those.
192+
if err != nil && !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) {
198193
bomErr = errors.Join(ErrRead, err)
199194

200195
return

utfbom_test.go

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -310,21 +310,44 @@ func TestReader_EmptyBuffer(t *testing.T) {
310310
}
311311
}
312312

313-
func TestReader_WrappeeReaderIsTooSmall(t *testing.T) {
313+
// TestReader_WrappeeReaderHasTinyPayload_EnoughBuffer tests that the first Read
314+
// strips the BOM and returns the rest of a tiny payload without an error.
315+
func TestReader_WrappeeReaderHasTinyPayload_EnoughBuffer(t *testing.T) {
314316
t.Parallel()
315317

316-
wrappee := strings.NewReader("a")
318+
wrappee := bytes.NewReader([]byte{0xff, 0xfe, 0x01, 0x02, 0x03})
317319
wrapped := utfbom.NewReader(wrappee)
318320

319321
buf := make([]byte, 100)
320322
n, err := wrapped.Read(buf)
321-
be.Equal(t, 0, n)
322-
be.Err(t, err, io.EOF)
323-
be.Err(t, err, utfbom.ErrRead)
323+
be.Equal(t, 3, n)
324+
t.Logf("have read %q", string(buf[:n]))
325+
t.Logf("detected enc: %s", wrapped.Enc)
326+
t.Logf("err: %v", err)
327+
be.Equal(t, buf[:n], []byte{0x01, 0x02, 0x03})
328+
be.Err(t, err, nil)
324329

325-
// you might proceed reading if you want
326330
n, err = wrapped.Read(buf)
327-
be.Err(t, err, nil)
328-
be.Equal(t, 1, n)
329-
be.Equal(t, string(buf[:n]), "a")
331+
t.Logf("second read returns err: %v", err)
332+
be.Err(t, err, io.EOF)
333+
be.Equal(t, 0, n)
334+
}
335+
336+
func TestReader_WrappeeReaderHasTinyPayload_OneByteBuffer(t *testing.T) {
337+
t.Parallel()
338+
339+
wrappee := bytes.NewReader([]byte{0xff, 0xfe, 0x01, 0x02, 0x03})
340+
rd := iotest.OneByteReader(utfbom.NewReader(wrappee))
341+
342+
buf := make([]byte, 1)
343+
for i := range 3 {
344+
n, err := rd.Read(buf)
345+
be.Err(t, err, nil)
346+
be.Equal(t, 1, n)
347+
be.Equal(t, []byte{0x01 + byte(i)}, buf[:n])
348+
}
349+
350+
n, err := rd.Read(buf)
351+
be.Err(t, err, io.EOF)
352+
be.Equal(t, 0, n)
330353
}

0 commit comments

Comments
 (0)