-
Notifications
You must be signed in to change notification settings - Fork 1k
Labels
Description
# Minimal reproducible example
library(data.table)
# create an rbindlist argument containing names that need conversion to UTF-8
iconv(c('føø', 'bär', 'báz'), to = 'latin1') |>
lapply(\(n) setNames(list(42), n)) |>
_[rep(1:3, length.out = 100)] -> foo
# make sure that the UTF-8 originals don't stay in the CHARSXP cache
gc(full = TRUE)
gctorture2(1, 1, TRUE)
rbindlist(foo, fill = TRUE) -> dt
==27978==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x602000026b0c at pc 0x7f8cb292f89f bp 0x7ffd85e47450 sp 0x7ffd85e47448
READ of size 4 at 0x602000026b0c thread T0
#0 0x7f8cb292f89e in rbindlist /tmp/RtmpmFwjdU/R.INSTALL6b6a21c82e15/data.table/src/rbindlist.c:119
0x602000026b0c is located 4 bytes to the left of 12-byte region [0x602000026b10,0x602000026b1c)
allocated by thread T0 here:
#0 0x7f8cbdab83b7 in __interceptor_calloc ../../../../src/libsanitizer/asan/asan_malloc_linux.cpp:77
#1 0x7f8cb292e1e2 in rbindlist /tmp/RtmpmFwjdU/R.INSTALL6b6a21c82e15/data.table/src/rbindlist.c:98
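The previously marked CHARSXP value gets garbage-collected between the two passes, and the one newly produced by the second call to ENC2UTF8 doesn't have any marks: its TRUELENGTH is 0 rather than the negative index assigned in the first pass, so `-TRUELENGTH(s)-1` evaluates to -1 and the access lands one `int` (4 bytes) before the start of the buffer, matching the ASan report above: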
Lines 88 to 119 in 4152c6c
| SEXP s = ENC2UTF8(cnp[j]); // convert different encodings for use.names #5452 | |
| if (TRUELENGTH(s)<0) continue; // seen this name before | |
| if (TRUELENGTH(s)>0) savetl(s); | |
| uniq[nuniq++] = s; | |
| SET_TRUELENGTH(s,-nuniq); | |
| } | |
| } | |
| if (nuniq>0) uniq = realloc(uniq, nuniq*sizeof(SEXP)); // shrink to only what we need to release the spare | |
| // now count the dups (if any) and how they're distributed across the items | |
| int *counts = (int *)calloc(nuniq, sizeof(int)); // counts of names for each colnames | |
| int *maxdup = (int *)calloc(nuniq, sizeof(int)); // the most number of dups for any name within one colname vector | |
| if (!counts || !maxdup) { | |
| // # nocov start | |
| for (int i=0; i<nuniq; ++i) SET_TRUELENGTH(uniq[i], 0); | |
| free(uniq); free(counts); free(maxdup); | |
| savetl_end(); | |
| error(_("Failed to allocate nuniq=%d items working memory in rbindlist.c"), nuniq); | |
| // # nocov end | |
| } | |
| // second pass - count duplicates | |
| for (int i=0; i<LENGTH(l); i++) { | |
| SEXP li = VECTOR_ELT(l, i); | |
| int thisncol=length(li); | |
| if (thisncol==0) continue; | |
| const SEXP cn = getAttrib(li, R_NamesSymbol); | |
| if (!length(cn)) continue; | |
| const SEXP *cnp = STRING_PTR_RO(cn); | |
| memset(counts, 0, nuniq*sizeof(int)); | |
| for (int j=0; j<thisncol; j++) { | |
| SEXP s = ENC2UTF8(cnp[j]); // convert different encodings for use.names #5452 | |
| counts[ -TRUELENGTH(s)-1 ]++; |
There may be other suspect uses where ENC2UTF8(...) values aren't protected. This surfaced while looking for a way to adapt the memory management in rbindlist to a potentially growing hash table.
# Output of sessionInfo()
R Under development (unstable) (2025-12-06 r89118)
Platform: x86_64-pc-linux-gnu
Running under: Devuan GNU/Linux 5 (daedalus)
Matrix products: default
BLAS: REDACTED/lib/libRblas.so
LAPACK: REDACTED/lib/libRlapack.so; LAPACK version 3.12.1
locale:
[1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8 LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8 LC_PAPER=C.UTF-8
[8] LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Moscow
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] data.table_1.17.8
loaded via a namespace (and not attached):
[1] compiler_4.6.0 grid_4.6.0 lattice_0.22-7
MichaelChirico