Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 6 additions & 22 deletions R/frollapply.R
Original file line number Diff line number Diff line change
Expand Up @@ -297,38 +297,22 @@ frollapply = function(X, N, FUN, ..., by.column=TRUE, fill=NA, align=c("right","
tight = function(i, dest, src, n) FUN(.Call(CmemcpyDT, dest, src, i, n), ...)
}
} else {
#has.growable = base::getRversion() >= "3.4.0"
## this is now always TRUE
## we keep this branch, it may be useful when getting rid of SET_GROWABLE_BIT and SETLENGTH #6180
has.growable = TRUE
cpy = if (has.growable) function(x) .Call(Csetgrowable, copy(x)) else copy
cpy = function(x) .Call(CcopyAsGrowable, x, by.column)
ansMask = function(len, n) {
mask = seq_len(len) >= n
mask[is.na(mask)] = FALSE ## test 6010.206
mask
}
if (by.column) {
allocWindow = function(x, n) x[seq_len(max(n, na.rm=TRUE))]
if (has.growable) {
tight = function(i, dest, src, n) FUN(.Call(CmemcpyVectoradaptive, dest, src, i, n), ...) # CmemcpyVectoradaptive handles k[i]==0
} else {
tight = function(i, dest, src, n) {stopf("internal error: has.growable should be TRUE, implement support for n==0"); FUN(src[(i-n[i]+1L):i], ...)} # nocov
}
allocWindow = function(x, n) cpy(x[seq_len(max(n, na.rm=TRUE))])
tight = function(i, dest, src, n) FUN(.Call(CmemcpyVectoradaptive, dest, src, i, n), ...) # CmemcpyVectoradaptive handles k[i]==0
} else {
if (!list.df) {
allocWindow = function(x, n) x[seq_len(max(n, na.rm=TRUE)), , drop=FALSE]
} else {
allocWindow = function(x, n) lapply(x, `[`, seq_len(max(n)))
}
if (has.growable) {
tight = function(i, dest, src, n) FUN(.Call(CmemcpyDTadaptive, dest, src, i, n), ...) # CmemcpyDTadaptive handles k[i]==0
allocWindow = function(x, n) cpy(x[seq_len(max(n, na.rm=TRUE)), , drop=FALSE])
} else {
if (!list.df) { # nocov
tight = function(i, dest, src, n) {stopf("internal error: has.growable should be TRUE, implement support for n==0"); FUN(src[(i-n[i]+1L):i, , drop=FALSE], ...)} # nocov
} else {
tight = function(i, dest, src, n) {stopf("internal error: has.growable should be TRUE, implement support for n==0"); FUN(lapply(src, `[`, (i-n[i]+1L):i), ...)} # nocov
}
allocWindow = function(x, n) cpy(lapply(x, `[`, seq_len(max(n))))
}
tight = function(i, dest, src, n) FUN(.Call(CmemcpyDTadaptive, dest, src, i, n), ...) # CmemcpyDTadaptive handles k[i]==0
}
}
## prepare templates for errors and warnings
Expand Down
2 changes: 1 addition & 1 deletion inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -19453,7 +19453,7 @@ test(2290.4, DT[, `:=`(a = 2, c := 3)], error="It looks like you re-used `:=` in
df = data.frame(a=1:3)
setDT(df)
attr(df, "att") = 1
test(2291.1, set(df, NULL, "new", "new"), error="attributes .* have been reassigned")
test(2291.1, set(df, NULL, "new", "new"), error="either been loaded from disk.*or constructed manually.*Please run setDT.*setalloccol.*on it first")

# ns-qualified bysub error, #6493
DT = data.table(a = 1)
Expand Down
101 changes: 19 additions & 82 deletions src/assign.c
Original file line number Diff line number Diff line change
@@ -1,28 +1,5 @@
#include "data.table.h"

static void finalizer(SEXP p)
{
SEXP x;
R_len_t n, l, tl;
if(!R_ExternalPtrAddr(p)) internal_error(__func__, "didn't receive an ExternalPtr"); // # nocov
p = R_ExternalPtrTag(p);
if (!isString(p)) internal_error(__func__, "ExternalPtr doesn't see names in tag"); // # nocov
l = LENGTH(p);
tl = TRUELENGTH(p);
if (l<0 || tl<l) internal_error(__func__, "l=%d, tl=%d", l, tl); // # nocov
n = tl-l;
if (n==0) {
// gc's ReleaseLargeFreeVectors() will have reduced R_LargeVallocSize by the correct amount
// already, so nothing to do (but almost never the case).
return;
}
x = PROTECT(allocVector(INTSXP, 50)); // 50 so it's big enough to be on LargeVector heap. See NodeClassSize in memory.c:allocVector
// INTSXP rather than VECSXP so that GC doesn't inspect contents after LENGTH (thanks to Karl Miller, Jul 2015)
SETLENGTH(x,50+n*2*sizeof(void *)/4); // 1*n for the names, 1*n for the VECSXP itself (both are over allocated).
UNPROTECT(1);
return;
}

void setselfref(SEXP x) {
if(!INHERITS(x, char_datatable)) return; // #5286
SEXP p;
Expand All @@ -38,7 +15,6 @@ void setselfref(SEXP x) {
R_NilValue
))
));
R_RegisterCFinalizerEx(p, finalizer, FALSE);
UNPROTECT(2);

/*
Expand All @@ -64,39 +40,8 @@ void setselfref(SEXP x) {
*/
}

/* There are two reasons the finalizer doesn't restore the LENGTH to TRUELENGTH. i) The finalizer
happens too late after GC has already released the memory, and ii) copies by base R (e.g.
[<- in write.table just before test 894) allocate at length LENGTH but copy the TRUELENGTH over.
If the finalizer sets LENGTH to TRUELENGTH, that's a fail as it wasn't really allocated at
TRUELENGTH when R did the copy.
Karl Miller suggested an ENVSXP so that restoring LENGTH in finalizer should work. This is the
closest I got to getting it to pass all tests :

SEXP env = PROTECT(allocSExp(ENVSXP));
defineVar(SelfRefSymbol, x, env);
defineVar(R_NamesSymbol, getAttrib(x, R_NamesSymbol), env);
setAttrib(x, SelfRefSymbol, p = R_MakeExternalPtr(
R_NilValue, // for identical() to return TRUE. identical() doesn't look at tag and prot
R_NilValue, //getAttrib(x, R_NamesSymbol), // to detect if names has been replaced and its tl lost, e.g. setattr(DT,"names",...)
PROTECT( // needed when --enable-strict-barrier it seems, iiuc. TO DO: test under that flag and remove if not needed.
env // wrap x in env to avoid an infinite loop in object.size() if prot=x were here
)
));
R_RegisterCFinalizerEx(p, finalizer, FALSE);
UNPROTECT(2);

Then in finalizer:
SETLENGTH(names, tl)
SETLENGTH(dt, tl)

and that finalizer indeed now happens before the GC releases memory (thanks to the env wrapper).

However, we still have problem (ii) above and it didn't pass tests involving base R copies.

We really need R itself to start setting TRUELENGTH to be the allocated length and then
for GC to release TRUELENGTH not LENGTH. Would really tidy this up.

Moved out of ?setkey Details section in 1.12.2 (Mar 2019). Revisit this w.r.t. to recent versions of R.
/*
Moved out of ?setkey Details section in 1.12.2 (Mar 2019). Revisit this w.r.t. to recent versions of R.
The problem (for \code{data.table}) with the copy by \code{key<-} (other than
being slower) is that \R doesn't maintain the over-allocated truelength, but it
looks as though it has. Adding a column by reference using \code{:=} after a
Expand Down Expand Up @@ -126,15 +71,9 @@ static int _selfrefok(SEXP x, Rboolean checkNames, Rboolean verbose) {
tag = R_ExternalPtrTag(v);
if (!(isNull(tag) || isString(tag))) internal_error(__func__, ".internal.selfref tag is neither NULL nor a character vector"); // # nocov
names = getAttrib(x, R_NamesSymbol);
if (names!=tag && isString(names) && !ALTREP(names)) // !ALTREP for #4734
SET_TRUELENGTH(names, LENGTH(names));
// R copied this vector not data.table; it's not actually over-allocated. It looks over-allocated
// because R copies the original vector's tl over despite allocating length.
prot = R_ExternalPtrProtected(v);
if (TYPEOF(prot) != EXTPTRSXP) // Very rare. Was error(_(".internal.selfref prot is not itself an extptr")).
return 0; // # nocov ; see http://stackoverflow.com/questions/15342227/getting-a-random-internal-selfref-error-in-data-table-for-r
if (x!=R_ExternalPtrAddr(prot) && !ALTREP(x))
SET_TRUELENGTH(x, LENGTH(x)); // R copied this vector not data.table, it's not actually over-allocated
return checkNames ? names==tag : x==R_ExternalPtrAddr(prot);
}

Expand All @@ -151,7 +90,7 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n)
// called from alloccol where n is checked carefully, or from shallow() at R level
// where n is set to truelength (i.e. a shallow copy only with no size change)
int protecti=0;
SEXP newdt = PROTECT(allocVector(VECSXP, n)); protecti++; // to do, use growVector here?
SEXP newdt = PROTECT(R_allocResizableVector(VECSXP, n)); protecti++; // to do, use growVector here?
SHALLOW_DUPLICATE_ATTRIB(newdt, dt);

// TO DO: keepattr() would be faster, but can't because shallow isn't merely a shallow copy. It
Expand All @@ -169,7 +108,7 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n)
setAttrib(newdt, sym_sorted, duplicate(sorted));

SEXP names = PROTECT(getAttrib(dt, R_NamesSymbol)); protecti++;
SEXP newnames = PROTECT(allocVector(STRSXP, n)); protecti++;
SEXP newnames = PROTECT(R_allocResizableVector(STRSXP, n)); protecti++;
const int l = isNull(cols) ? LENGTH(dt) : length(cols);
if (isNull(cols)) {
for (int i=0; i<l; ++i) SET_VECTOR_ELT(newdt, i, VECTOR_ELT(dt,i));
Expand All @@ -186,13 +125,9 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n)
for (int i=0; i<l; ++i) SET_STRING_ELT( newnames, i, STRING_ELT(names,INTEGER(cols)[i]-1) );
}
}
R_resizeVector(newdt,l);
R_resizeVector(newnames,l);
setAttrib(newdt, R_NamesSymbol, newnames);
// setAttrib appears to change length and truelength, so need to do that first _then_ SET next,
// otherwise (if the SET were were first) the 100 tl is assigned to length.
SETLENGTH(newnames,l);
SET_TRUELENGTH(newnames,n);
SETLENGTH(newdt,l);
SET_TRUELENGTH(newdt,n);
setselfref(newdt);
UNPROTECT(protecti);
return(newdt);
Expand Down Expand Up @@ -260,7 +195,7 @@ SEXP alloccol(SEXP dt, R_len_t n, Rboolean verbose)
// if (TRUELENGTH(getAttrib(dt,R_NamesSymbol))!=tl)
// internal_error(__func__, "tl of dt passes checks, but tl of names (%d) != tl of dt (%d)", tl, TRUELENGTH(getAttrib(dt,R_NamesSymbol))); // # nocov

tl = TRUELENGTH(dt);
tl = R_maxLength(dt);
// R <= 2.13.2 and we didn't catch uninitialized tl somehow
if (tl<0) internal_error(__func__, "tl of class is marked but tl<0"); // # nocov
if (tl>0 && tl<l) internal_error(__func__, "tl (%d) < l (%d) but tl of class is marked", tl, l); // # nocov
Expand Down Expand Up @@ -310,11 +245,11 @@ SEXP shallowwrapper(SEXP dt, SEXP cols) {
if (!selfrefok(dt, FALSE)) {
int n = isNull(cols) ? length(dt) : length(cols);
return(shallow(dt, cols, n));
} else return(shallow(dt, cols, TRUELENGTH(dt)));
} else return(shallow(dt, cols, R_maxLength(dt)));
}

SEXP truelength(SEXP x) {
return ScalarInteger(isNull(x) ? 0 : TRUELENGTH(x));
return ScalarInteger(isNull(x) || !R_isResizable(x) ? 0 : R_maxLength(x));
}

SEXP selfrefokwrapper(SEXP x, SEXP verbose) {
Expand Down Expand Up @@ -509,10 +444,10 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
// modify DT by reference. Other than if new columns are being added and the allocVec() fails with
// out-of-memory. In that case the user will receive hard halt and know to rerun.
if (length(newcolnames)) {
oldtncol = TRUELENGTH(dt); // TO DO: oldtncol can be just called tl now, as we won't realloc here any more.
if (!R_isResizable(dt)) error(_("This data.table has either been loaded from disk (e.g. using readRDS()/load()) or constructed manually (e.g. using structure()). Please run setDT() or setalloccol() on it first (to pre-allocate space for new columns) before assigning by reference to it.")); // #2996
oldtncol = R_maxLength(dt); // TO DO: oldtncol can be just called tl now, as we won't realloc here any more.

if (oldtncol<oldncol) {
if (oldtncol==0) error(_("This data.table has either been loaded from disk (e.g. using readRDS()/load()) or constructed manually (e.g. using structure()). Please run setDT() or setalloccol() on it first (to pre-allocate space for new columns) before assigning by reference to it.")); // #2996
internal_error(__func__, "oldtncol(%d) < oldncol(%d)", oldtncol, oldncol); // # nocov
}
if (oldtncol>oldncol+10000L) warning(_("truelength (%d) is greater than 10,000 items over-allocated (length = %d). See ?truelength. If you didn't set the datatable.alloccol option very large, please report to data.table issue tracker including the result of sessionInfo()."),oldtncol, oldncol);
Expand All @@ -522,13 +457,14 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
error(_("It appears that at some earlier point, names of this data.table have been reassigned. Please ensure to use setnames() rather than names<- or colnames<-. Otherwise, please report to data.table issue tracker.")); // # nocov
// Can growVector at this point easily enough, but it shouldn't happen in first place so leave it as
// strong error message for now.
else if (TRUELENGTH(names) != oldtncol)
else if (R_maxLength(names) != oldtncol)
// Use (long long) to cast R_xlen_t to a fixed type to robustly avoid -Wformat compiler warnings, see #5768, PRId64 didn't work
internal_error(__func__, "selfrefnames is ok but tl names [%lld] != tl [%d]", (long long)TRUELENGTH(names), oldtncol); // # nocov
internal_error(__func__, "selfrefnames is ok but maxLength(names) [%lld] != maxLength(dt) [%d]", (long long)R_maxLength(names), oldtncol); // # nocov
if (!selfrefok(dt, verbose)) // #6410 setDT(dt) and subsequent attr<- can lead to invalid selfref
error(_("It appears that at some earlier point, attributes of this data.table have been reassigned. Please use setattr(DT, name, value) rather than attr(DT, name) <- value. If that doesn't apply to you, please report your case to the data.table issue tracker."));
SETLENGTH(dt, oldncol+LENGTH(newcolnames));
SETLENGTH(names, oldncol+LENGTH(newcolnames));
R_resizeVector(dt, oldncol+LENGTH(newcolnames));
R_resizeVector(names, oldncol+LENGTH(newcolnames));
setAttrib(dt, R_NamesSymbol, names);
for (int i=0; i<LENGTH(newcolnames); ++i)
SET_STRING_ELT(names,oldncol+i,STRING_ELT(newcolnames,i));
// truelengths of both already set by alloccol
Expand Down Expand Up @@ -726,8 +662,9 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
SET_VECTOR_ELT(dt, i, R_NilValue);
SET_STRING_ELT(names, i, NA_STRING); // release reference to the CHARSXP
}
SETLENGTH(dt, ndt-ndelete);
SETLENGTH(names, ndt-ndelete);
R_resizeVector(dt, ndt-ndelete);
R_resizeVector(names, ndt-ndelete);
setAttrib(dt, R_NamesSymbol, names);
if (LENGTH(names)==0) {
// That was last column deleted, leaving NULL data.table, so we need to reset .row_names, so that it really is the NULL data.table.
PROTECT(nullint=allocVector(INTSXP, 0)); protecti++;
Expand Down
20 changes: 19 additions & 1 deletion src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,22 @@
# endif
#endif

// TODO(R>=4.6.0): remove the SVN revision check
#if R_VERSION < R_Version(4, 6, 0) || R_SVN_REVISION < 89077
# define R_allocResizableVector(type, maxlen) R_allocResizableVector_(type, maxlen)
# define R_duplicateAsResizable(x) R_duplicateAsResizable_(x)
# define R_resizeVector(x, newlen) SETLENGTH(x, newlen)
Copy link
Member

@HughParsonage HughParsonage Dec 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be guarded by R_isResizable_(x) and new_len <= TRUELENGTH(x), and no attributes?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. It would be cleaner to guard for them, so we not only catch them via R-devels runner

if (!R_isResizable(x))
  error(_("not a resizable vector"));
if (newlen > XTRUELENGTH(x))
  error(_("'newlen' is too large"));

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we should introduce extra checks here, this is called in tight loops, and it is better to check before those loops.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's a better benchmark for this, frollapply(adaptive = TRUE), or foo[bar, by = baz]?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

frolladaptive willa stress more, unless grouping is by unique column, then probably similarly

# define R_maxLength(x) R_maxLength_(x)
static inline R_xlen_t R_maxLength_(SEXP x) {
return IS_GROWABLE(x) ? TRUELENGTH(x) : XLENGTH(x);
}
# define R_isResizable(x) R_isResizable_(x)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this need

if (ALTREP(x)) return false;

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch. We also checked for ALTREP(x) in the proposed version

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could also teach copyAsPlain to allocate resizable vectors and use it instead of R_duplicateAsResizable, but for now this works.

static inline bool R_isResizable_(SEXP x) {
// IS_GROWABLE also checks for XLENGTH < TRUELENGTH
return (LEVELS(x) & 0x20) && TRUELENGTH(x);
}
#endif

// init.c
extern SEXP char_integer64;
extern SEXP char_ITime;
Expand Down Expand Up @@ -282,7 +298,7 @@ SEXP memcpyVector(SEXP dest, SEXP src, SEXP offset, SEXP size);
SEXP memcpyDT(SEXP dest, SEXP src, SEXP offset, SEXP size);
SEXP memcpyVectoradaptive(SEXP dest, SEXP src, SEXP offset, SEXP size);
SEXP memcpyDTadaptive(SEXP dest, SEXP src, SEXP offset, SEXP size);
SEXP setgrowable(SEXP x);
SEXP copyAsGrowable(SEXP x, SEXP by_column);

// nafill.c
void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, bool nan_is_na, ans_t *ans, bool verbose);
Expand Down Expand Up @@ -322,6 +338,8 @@ bool perhapsDataTable(SEXP x);
SEXP perhapsDataTableR(SEXP x);
SEXP frev(SEXP x, SEXP copyArg);
NORET void internal_error(const char *call_name, const char *format, ...);
SEXP R_allocResizableVector_(SEXPTYPE type, R_xlen_t maxlen);
SEXP R_duplicateAsResizable_(SEXP x);

// types.c
char *end(char *start);
Expand Down
7 changes: 4 additions & 3 deletions src/dogroups.c
Original file line number Diff line number Diff line change
Expand Up @@ -322,8 +322,9 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
// Even if we could know reliably to switch from allocNAVectorLike to allocVector for slight speedup, user code could still
// contain a switched halt, and in that case we'd want the groups not yet done to have NA rather than 0 or uninitialized.
// Increment length only if the allocation passes, #1676. But before SET_VECTOR_ELT otherwise attempt-to-set-index-n/n R error
SETLENGTH(dtnames, LENGTH(dtnames)+1);
SETLENGTH(dt, LENGTH(dt)+1);
R_resizeVector(dtnames, LENGTH(dtnames)+1);
R_resizeVector(dt, LENGTH(dt)+1);
setAttrib(dt, R_NamesSymbol, dtnames);
SET_VECTOR_ELT(dt, colj, target);
UNPROTECT(1);
SET_STRING_ELT(dtnames, colj, STRING_ELT(newnames, colj-origncol));
Expand Down Expand Up @@ -534,7 +535,7 @@ SEXP growVector(SEXP x, const R_len_t newlen)
SEXP newx;
R_len_t len = length(x);
if (isNull(x)) error(_("growVector passed NULL"));
PROTECT(newx = allocVector(TYPEOF(x), newlen)); // TO DO: R_realloc(?) here?
PROTECT(newx = R_allocResizableVector(TYPEOF(x), newlen)); // TO DO: R_realloc(?) here?
if (newlen < len) len=newlen; // i.e. shrink
if (!len) { // cannot memcpy invalid pointer, #6819
keepattr(newx, x);
Expand Down
Loading
Loading