Rdatatable · ben-schwen · Feb 19, 2026 · Feb 19, 2026
@@ -28,7 +28,7 @@ export(tstrsplit)
 export(frank)
 export(frankv)
 export(address)
-export(.SD,.N,.I,.GRP,.NGRP,.BY,.EACHI, measure, measurev, patterns)
+export(.SD,.N,.I,.GRP,.NGRP,.BY,.EACHI,.ROW, measure, measurev, patterns)
 # TODO(#6197): Export these.
 # export(., J)
 export(rleid)

@@ -11,7 +11,7 @@ methods::setPackageName("data.table",.global)
 #   (1) add to man/special-symbols.Rd
 #   (2) export() in NAMESPACE
 #   (3) add to vignettes/datatable-importing.Rmd#globals section
-.SD = .N = .I = .GRP = .NGRP = .BY = .EACHI = NULL
+.SD = .N = .I = .GRP = .NGRP = .BY = .EACHI = .ROW = NULL
 # These are exported to prevent NOTEs from R CMD check, and checkUsage via compiler.
 # But also exporting them makes it clear (to users and other packages) that data.table uses these as symbols.
 # And NULL makes it clear (to the R's mask check on loading) that they're variables not functions.
@@ -1559,6 +1559,19 @@ replace_dot_alias = function(e) {
           names(jsub)=""
           jsub[[1L]]=as.name("list")
         }
+
+        # Check for .ROW := NULL pattern (delete rows by reference)
+        if ((is.character(lhs) && length(lhs)==1L && lhs==".ROW") ||
+            (is.name(lhs) && identical(lhs, quote(.ROW)))) {
+          if (!is.null(jsub) && !identical(jsub, quote(NULL)))
+            stopf(".ROW can only be used with := NULL to delete rows")
+          if (is.null(irows))
+            stopf(".ROW := NULL requires i= condition to specify rows to delete")
+          if (!missingby)
+            stopf(".ROW := NULL does not support 'by' or 'keyby'. To delete rows using by grouping, first compute the row indices (e.g. rows = DT[, .I[cond], by=grp]$V1) and then delete them DT[rows, .ROW := NULL].")
+          .Call(CdeleteRows, x, irows)
+          return(suppPrint(x))
+        }
         av = all.vars(jsub,TRUE)
         if (!is.atomic(lhs)) stopf("LHS of := must be a symbol, or an atomic vector (column names or positions).")
         if (is.character(lhs)) {

@@ -21520,3 +21520,95 @@ test(2365.1, melt(df_melt, id.vars=1:2), melt(dt_melt, id.vars=1:2))
 df_dcast = data.frame(a = c("x", "y"), b = 1:2, v = 3:4)
 dt_dcast = data.table(a = c("x", "y"), b = 1:2, v = 3:4)
 test(2365.2, dcast(df_dcast, a ~ b, value.var = "v"), dcast(dt_dcast, a ~ b, value.var = "v"))
+
+# delete rows by reference #635
+# atomic types and list columns
+dt = data.table(
+  int = 1:5,
+  real = c(1.1, 2.2, 3.3, 4.4, 5.5),
+  char = letters[1:5],
+  lgl = c(TRUE, FALSE, TRUE, FALSE, TRUE),
+  cplx = as.complex(1:5),
+  raw_col = as.raw(1:5),
+  list_col = list(1L, 1:2, 1:3, 1:4, 1:5)
+)
+test(2366.01, copy(dt)[1L, .ROW := NULL], dt[-1])
+test(2366.02, copy(dt)[1, .ROW := NULL], dt[-1])
+test(2366.03, copy(dt)[c(TRUE, FALSE, FALSE, TRUE, FALSE), .ROW := NULL], dt[-c(1,4)])
+test(2366.04, copy(dt)[int==1L, .ROW := NULL], dt[-1])
+test(2366.05, copy(dt)[int<2L, .ROW := NULL], dt[-1])
+test(2366.06, copy(dt)[-1, .ROW := NULL], dt[1])
+# zero row or empty data.tables
+dt = data.table()
+test(2366.07, dt[logical(0), .ROW := NULL], dt)
+dt = data.table(a=integer(0), b=character(0))
+test(2366.08, dt[logical(0), .ROW := NULL], dt)
+# multirow
+dt = data.table(a=1:5, b=letters[1:5])
+test(2366.09, copy(dt)[c(1L, 3L), .ROW := NULL], dt[c(2,4,5)])
+test(2366.11, copy(dt)[1:2, .ROW := NULL], dt[3:5])
+test(2366.12, copy(dt)[1:5, .ROW := NULL], dt[0])
+# NA handling and edge cases
+dt = data.table(a=1:5, b=letters[1:5])
+test(2366.13, copy(dt)[c(1L, NA_integer_, 3L), .ROW := NULL], dt[c(2,4,5)])
+test(2366.14, copy(dt)[c(NA_integer_, NA_integer_), .ROW := NULL], dt)
+test(2366.15, copy(dt)[c(TRUE, NA, FALSE, NA, TRUE), .ROW := NULL], dt[c(2,3,4)])
+test(2366.16, copy(dt)[integer(0), .ROW := NULL], dt)
+test(2366.17, copy(dt)[logical(0), .ROW := NULL], dt)
+test(2366.18, copy(dt)[c(FALSE, FALSE, FALSE, FALSE, FALSE), .ROW := NULL], dt)
+test(2366.19, copy(dt)[a > 100, .ROW := NULL], dt)  # no matches
+# Duplicate indices
+dt = data.table(a=1:5, b=letters[1:5])
+test(2366.20, copy(dt)[c(1L, 1L), .ROW := NULL], dt[-1])
+test(2366.21, copy(dt)[c(1L, 1L, 2L, 2L), .ROW := NULL], dt[3:5])
+test(2366.22, copy(dt)[c(3L, 1L, 3L, 1L), .ROW := NULL], dt[c(2,4,5)])
+# integer64
+if (test_bit64) {
+  dt = data.table(a=1:5, b=as.integer64(11:15))
+  test(2366.23, copy(dt)[c(1L, 3L), .ROW := NULL], dt[-c(1L,3L)])
+  test(2366.24, copy(dt)[1:5, .ROW := NULL], data.table(a=integer(0), b=integer64(0)))
+}
+# Date/IDate/ITime columns
+dt = data.table(a=1:5, d=as.Date("2024-01-01") + 0:4, t=as.ITime(paste0(10:14, ":00:00")), dt=as.POSIXct("2024-01-01 12:00:00") + 3600*0:4)
+test(2366.25, copy(dt)[c(1L, 3L), .ROW := NULL], dt[c(2,4,5)])
+test(2366.26, copy(dt)[c(2L, 4L), .ROW := NULL]$d, as.Date("2024-01-01") + c(0,2,4))
+# Factor columns
+dt = data.table(a=1:5, f=factor(letters[1:5], levels=letters[1:10]))
+test(2366.27, copy(dt)[c(1L, 3L), .ROW := NULL], dt[-c(1L,3L)])
+test(2366.28, levels(copy(dt)[c(1L, 3L), .ROW := NULL]$f), letters[1:10])
+dt = data.table(a=1:5, of=ordered(letters[1:5], levels=letters[5:1]))
+test(2366.29, copy(dt)[c(2L, 4L), .ROW := NULL], dt[-c(2L,4L)])
+test(2366.30, is.ordered(copy(dt)[c(2, 4L), .ROW := NULL]$of))
+# Keys - should be cleared after deletion
+dt = data.table(a=5:1, b=letters[1:5], key="a")
+test(2366.31, key(copy(dt)[1L, .ROW := NULL]), NULL)
+test(2366.32, haskey(copy(dt)[1L, .ROW := NULL]), FALSE)
+# Indices - should be cleared after deletion
+dt = data.table(a=1:5, b=letters[1:5], c=5:1)
+setindex(dt, b)
+test(2366.33, indices(copy(dt)[1L, .ROW := NULL]), NULL)
+# row names
+dt = data.table(a=1:5, b=letters[1:5])
+test(2366.34, attr(copy(dt)[c(1L, 3L), .ROW := NULL], "row.names"), 1:3)
+# selfref check
+test(2366.35, selfrefok(copy(dt)[1L, .ROW := NULL]), 1L)
+# errors
+dt = data.table(a=1:4, g=1:2)
+test(2366.36, dt[1L, .ROW := 1L], error=".ROW can only be used with := NULL")
+test(2366.37, dt[1L, .ROW := "delete"], error=".ROW can only be used with := NULL")
+test(2366.38, dt[1L, .ROW := FALSE], error=".ROW can only be used with := NULL")
+test(2366.39, dt[, .ROW := NULL], error=".ROW := NULL requires i= condition")
+test(2366.40, dt[1L, .ROW := NULL, by=g], error=".ROW := NULL does not support 'by' or 'keyby'")
+# large table
+dt = data.table(a=1:20000, b=rep(letters, length.out=20000))
+idx = seq(1L, 20000L, by=2L)
+test(2366.41, copy(dt)[idx, .ROW := NULL], dt[-idx])
+# Chaining and more complex i expressions
+dt = data.table(a=1:10, b=letters[1:10])
+test(2366.42, copy(dt)[a>2, .ROW := NULL][b=="a"], data.table(a=1L, b="a"))
+test(2366.43, copy(dt)[a %% 2 == 0, .ROW := NULL], dt[a %% 2 != 0])
+test(2366.44, copy(dt)[!(a < 5 & b != "d"), .ROW := NULL], dt[1:3])
+# make columns resizable
+dt = data.table(a=1:3)
+test(2366.91, truelength(dt$a), 0L)
+test(2366.92, {setallocrow(dt); truelength(dt$a)}, 3L)
@@ -56,12 +56,16 @@ set(x, i = NULL, j, value)
     DT[i, colC := mean(colB), by = colA]          # update (or add) column called "colC" by reference by group. A major feature of `:=`.
     DT[,`:=`(new1 = sum(colB), new2 = sum(colC))] # Functional form
     DT[, let(new1 = sum(colB), new2 = sum(colC))] # New alias for functional form.
+    DT[i, .ROW := NULL]                           # delete rows by reference.
 }
 
 The \code{\link{.Last.updated}} variable contains the number of rows updated by the most recent \code{:=} or \code{set} calls, which may be useful, for example, in production settings for testing assumptions about the number of rows affected by a statement; see \code{\link{.Last.updated}} for details.
 
 Note that for efficiency no check is performed for duplicate assignments, i.e. if multiple values are passed for assignment to the same index, assignment to this index will occur repeatedly and sequentially; for a given use case, consider whether it makes sense to create your own test for duplicates, e.g. in production code.
 
+Note that \code{.ROW := NULL} is a special case used to delete rows by reference. Unlike column assignment, this requires an \code{i} expression to specify which rows to delete, and does not support \code{by} or \code{keyby}.
+To delete rows using a per-group condition, first compute the row indices (\code{rows = DT[, .I[cond], by=grp]$V1}) and then delete them with \code{DT[rows, .ROW := NULL]}. See \code{\link{.ROW}} or \code{\link{special-symbols}} for details.
+
 All of the following result in a friendly error (by design) :
 
 \preformatted{
@@ -158,6 +162,13 @@ set(DT, j = c("b", "d"), value = list(200L, 300L))
 ## Set values for multiple columns with multiple specified rows.
 set(DT, c(1L, 3L), c("b", "d"), value = list(500L, 800L))
 
+# Delete rows by reference
+DT = data.table(a=1:10, b=letters[1:10])
+DT[c(2,4,6), .ROW := NULL]      # delete rows 2, 4, and 6
+DT
+DT[a>5, .ROW := NULL]           # delete rows where a>5
+DT
+
 \dontrun{
 # Speed example:
 

@@ -9,10 +9,12 @@
 \alias{.EACHI}
 \alias{.NGRP}
 \alias{.NATURAL}
+\alias{.ROW}
 \title{ Special symbols }
 \description{
     \code{.SD}, \code{.BY}, \code{.N}, \code{.I}, \code{.GRP}, and \code{.NGRP} are \emph{read-only} symbols for use in \code{j}. \code{.N} can be used in \code{i} as well. \code{.I} can be used in \code{by} as well. See the vignettes, Details and Examples here and in \code{\link{data.table}}.
     \code{.EACHI} is a symbol passed to \code{by}; i.e. \code{by=.EACHI}, \code{.NATURAL} is a symbol passed to \code{on}; i.e. \code{on=.NATURAL}
+    \code{.ROW} is a symbol used with \code{:= NULL} to delete rows by reference. See also \code{\link{:=}}.
 }
 \details{
     The bindings of these variables are locked and attempting to assign to them will generate an error. If you wish to manipulate \code{.SD} before returning it, take a \code{\link{copy}(.SD)} first (see FAQ 4.5). Using \code{:=} in the \code{j} of \code{.SD} is reserved for future use as a (tortuously) flexible way to update \code{DT} by reference by group (even when groups are not contiguous in an ad hoc by).
@@ -32,6 +34,8 @@
 
     \code{.NATURAL} is defined as \code{NULL} but its value is not used. Its usage is \code{on=.NATURAL} (alternative of \code{X[on=Y]}) which joins two tables on their common column names, performing a natural join; see \code{\link{data.table}}'s \code{on} argument for more details.
 
+    \code{.ROW} is a symbol that can only be used with \code{:= NULL} to delete rows by reference. When you use \code{DT[i, .ROW := NULL]}, the rows matching the \code{i} expression are removed from \code{DT} in-place. This is an efficient way to delete rows without copying the entire data.table. The \code{i} argument is required and \code{by}/\code{keyby} are not supported. After deletion, any keys and indices on \code{DT} are cleared. See \code{\link{:=}} for more on reference semantics.
+
     Note that \code{.N} in \code{i} is computed up-front, while that in \code{j} applies \emph{after filtering in \code{i}}. That means that even absent grouping, \code{.N} in \code{i} can be different from \code{.N} in \code{j}. See Examples.
 
     Note also that you should consider these symbols read-only and of limited scope -- internal data.table code might manipulate them in unexpected ways, and as such their bindings are locked. There are subtle ways to wind up with the wrong object, especially when attempting to copy their values outside a grouping context. See examples; when in doubt, \code{copy()} is your friend.
@@ -72,5 +76,12 @@ DT[, .(min(.SD[,-1])), by=.I]
 # Do not expect this to correctly append the value of .BY in each group; copy(.BY) will work.
 by_tracker = list()
 DT[, { append(by_tracker, .BY); sum(v) }, by=x]
+
+# .ROW to delete rows by reference
+DT = data.table(a=1:5, b=letters[1:5])
+DT[c(2,4), .ROW := NULL]
+DT
+DT[a>2, .ROW := NULL]
+DT
 }
 \keyword{ data }
@@ -333,6 +333,9 @@ void copyVectorElements(SEXP dst, SEXP src, R_xlen_t n, bool deep_copy, const ch
 SEXP copyAsPlain(SEXP x, R_xlen_t overalloc);
 SEXP allocrow(SEXP dt, R_xlen_t n);
 void copySharedColumns(SEXP x);
+
+// deleterows.c
+SEXP deleteRows(SEXP dt, SEXP rows_to_delete);
 SEXP lock(SEXP x);
 SEXP unlock(SEXP x);
 bool islocked(SEXP x);

@@ -0,0 +1,179 @@
+#include "data.table.h"
+
+static void computePrefixSum(const int *keep, int *dest, R_xlen_t n, int nthreads);
+static void compactVectorRaw(SEXP col, const int *dest, const int *keep, R_xlen_t new_nrow, R_xlen_t old_nrow);
+
+// Delete rows from `dt` by reference and return `dt`.
+//
+// dt             : a list of columns; the row count is taken from the first column.
+// rows_to_delete : either an integer vector of 1-based row numbers (NA entries
+//                  are skipped, duplicates are harmless), or a logical vector
+//                  no longer than the row count where TRUE marks a row for
+//                  deletion (FALSE and NA keep the row).
+//
+// Every column is compacted in place and resized to the new row count, the
+// "row.names" attribute (when present) is rewritten in compact form, and the
+// "sorted" and "index" attributes are removed. When nothing is deleted `dt`
+// is returned untouched, attributes included.
+SEXP deleteRows(SEXP dt, SEXP rows_to_delete) {
+  if (!isNewList(dt))
+    internal_error(__func__, "received non-list dt"); // # nocov
+  if (!xlength(dt)) return dt; // zero-column data.table
+
+  const R_xlen_t ncol = length(dt);
+  const R_xlen_t old_nrow = length(VECTOR_ELT(dt, 0));
+  if (old_nrow == 0) return dt;
+
+  const bool is_mask = isLogical(rows_to_delete);
+  if (!is_mask && !isInteger(rows_to_delete))
+    internal_error(__func__, "rows_to_delete must be logical or integer"); // # nocov
+
+  // keep[i] is 1 when row i survives, 0 when it is deleted
+  int *keep = (int *)R_alloc(old_nrow, sizeof(int));
+  for (R_xlen_t i = 0; i < old_nrow; i++) keep[i] = 1;
+
+  const R_xlen_t n = length(rows_to_delete);
+  if (is_mask) {
+    // A logical vector is a per-row mask, not a list of row numbers.
+    if (n > old_nrow)
+      internal_error(__func__, "logical rows_to_delete has length %lld but dt has %lld rows", (long long)n, (long long)old_nrow); // # nocov
+    const int *flag = LOGICAL(rows_to_delete);
+    for (R_xlen_t j = 0; j < n; j++) {
+      if (flag[j] != NA_LOGICAL && flag[j] != 0) keep[j] = 0;
+    }
+  } else {
+    const int *idx = INTEGER(rows_to_delete);
+    for (R_xlen_t j = 0; j < n; j++) {
+      if (idx[j] == NA_INTEGER) continue;
+      // should be checked from irows in [
+      if (idx[j] < 1 || idx[j] > old_nrow)
+        internal_error(__func__, "Row index %d out of range [1, %lld]", idx[j], (long long)old_nrow); // # nocov
+      keep[idx[j] - 1] = 0;
+    }
+  }
+
+  R_xlen_t new_nrow = 0;
+  for (R_xlen_t i = 0; i < old_nrow; i++) new_nrow += keep[i];
+  if (new_nrow == old_nrow) return dt; // nothing to delete
+
+  // dest[i] = position row i moves to if it is kept
+  int *dest = (int *)R_alloc(old_nrow, sizeof(int));
+  const int nthreads = getDTthreads(old_nrow, true);
+  computePrefixSum(keep, dest, old_nrow, nthreads);
+
+  // Compact each column
+  // NOTE(review): if the same resizable vector were stored in two columns it
+  // would be compacted twice here -- confirm callers rule that out.
+  for (R_xlen_t j = 0; j < ncol; j++) {
+    SEXP col = VECTOR_ELT(dt, j);
+    if (!R_isResizable(col)) {
+      // Work on a private plain copy so the original vector is left untouched.
+      // NOTE(review): assumes copyAsPlain() returns a vector R_resizeVector() accepts -- confirm.
+      col = PROTECT(copyAsPlain(col, 0));
+      SET_VECTOR_ELT(dt, j, col);
+      // dt now references the copy, so release it immediately; this keeps the
+      // protect stack depth constant instead of growing with the column count.
+      UNPROTECT(1);
+    }
+    compactVectorRaw(col, dest, keep, new_nrow, old_nrow);
+    R_resizeVector(col, new_nrow);
+  }
+
+  SEXP rownames = PROTECT(getAttrib(dt, R_RowNamesSymbol));
+  if (!isNull(rownames)) {
+    // create them from scratch like in dogroups or subset to avoid R internal issues
+    SEXP rn = PROTECT(allocVector(INTSXP, 2));
+    INTEGER(rn)[0] = NA_INTEGER;
+    INTEGER(rn)[1] = -(int)new_nrow;
+    setAttrib(dt, R_RowNamesSymbol, rn);
+    UNPROTECT(1); // rn
+  }
+  UNPROTECT(1); // rownames
+
+  // Clear key and indices
+  setAttrib(dt, install("sorted"), R_NilValue);
+  setAttrib(dt, install("index"), R_NilValue);
+
+  return dt;
+}
+
+// Exclusive prefix sum (exclusive scan) of keep[0..n-1] into dest[0..n-1]:
+// dest[i] receives keep[0] + ... + keep[i-1].
+//
+// keep     : input values to be summed
+// dest     : output array of length n
+// n        : number of elements
+// nthreads : number of chunks / requested threads; <= 1 selects the
+//            sequential path
+//
+// The parallel path splits the input into `nthreads` contiguous chunks and
+// runs two passes: per-chunk totals, a sequential scan of those totals to get
+// each chunk's starting offset, then a per-chunk local scan. The loops iterate
+// over chunk indices rather than thread ids, so every chunk is processed even
+// when the OpenMP runtime grants fewer threads than requested.
+static void computePrefixSum(const int *keep, int *dest, R_xlen_t n, int nthreads) {
+  if (nthreads <= 1) {
+    // Sequential version
+    int sum = 0;
+    for (R_xlen_t i = 0; i < n; i++) {
+      dest[i] = sum;
+      sum += keep[i];
+    }
+    return;
+  }
+
+  const R_xlen_t chunk_size = (n + nthreads - 1) / nthreads;
+  int *chunk_offsets = (int *)R_alloc(nthreads, sizeof(int));
+
+  // Pass 1: total of keep[] within each chunk
+  #pragma omp parallel for schedule(static) num_threads(nthreads)
+  for (int c = 0; c < nthreads; c++) {
+    const R_xlen_t start = (R_xlen_t)c * chunk_size;
+    const R_xlen_t end = (start + chunk_size > n) ? n : start + chunk_size;
+    int count = 0;
+    for (R_xlen_t i = start; i < end; i++) {
+      count += keep[i];
+    }
+    chunk_offsets[c] = count;
+  }
+
+  // Sequential exclusive scan turning the per-chunk totals into start offsets
+  int running = 0;
+  for (int c = 0; c < nthreads; c++) {
+    const int count = chunk_offsets[c];
+    chunk_offsets[c] = running;
+    running += count;
+  }
+
+  // Pass 2: local exclusive scan within each chunk, shifted by its offset
+  #pragma omp parallel for schedule(static) num_threads(nthreads)
+  for (int c = 0; c < nthreads; c++) {
+    const R_xlen_t start = (R_xlen_t)c * chunk_size;
+    const R_xlen_t end = (start + chunk_size > n) ? n : start + chunk_size;
+    int local_sum = chunk_offsets[c];
+    for (R_xlen_t i = start; i < end; i++) {
+      dest[i] = local_sum;
+      local_sum += keep[i];
+    }
+  }
+}
+
+// COMPACT(CTYPE, ACCESSOR): slide every maximal run of kept elements of `col`
+// down to its target position with a single memmove. Expects `col`, `keep`,
+// `dest` and `old_nrow` to be in scope at the expansion site; ACCESSOR(col)
+// must yield a CTYPE pointer to the column data. Runs already in place are
+// not moved.
+#define COMPACT(CTYPE, ACCESSOR) {                                        \
+  CTYPE *data = ACCESSOR(col);                                            \
+  for (R_xlen_t pos = 0; pos < old_nrow; ) {                              \
+    if (!keep[pos]) {                                                     \
+      pos++;                                                              \
+      continue;                                                           \
+    }                                                                     \
+    const R_xlen_t first = pos;                                           \
+    const int target = dest[first];                                       \
+    do {                                                                  \
+      pos++;                                                              \
+    } while (pos < old_nrow && keep[pos]);                                \
+    const size_t count = pos - first;                                     \
+    if (target != first) {                                                \
+      memmove(data + target, data + first, count * sizeof(CTYPE));        \
+    }                                                                     \
+  }                                                                       \
+}
+
+
+// Move the kept elements of `col` to the front, preserving their order.
+//
+// col      : column vector to compact in place
+// dest     : dest[i] is the new position of element i when keep[i] is non-zero
+// keep     : keep[i] is non-zero when element i is retained
+// new_nrow : number of retained elements (not used by this function)
+// old_nrow : current number of elements in `col`
+//
+// Integer, logical, double, complex and raw columns are moved run by run via
+// COMPACT; character and list columns are moved one element at a time through
+// the SET_*_ELT setters. Any other column type raises an error. The length of
+// `col` is not changed here.
+static void compactVectorRaw(SEXP col, const int *dest, const int *keep,
+                             R_xlen_t new_nrow, R_xlen_t old_nrow) {
+  switch(TYPEOF(col)) {
+    case LGLSXP:
+    case INTSXP:
+      COMPACT(int, INTEGER);
+      break;
+    case REALSXP:
+      COMPACT(double, REAL);
+      break;
+    case CPLXSXP:
+      COMPACT(Rcomplex, COMPLEX);
+      break;
+    case RAWSXP:
+      COMPACT(Rbyte, RAW);
+      break;
+    case STRSXP:
+      for (R_xlen_t src = 0; src < old_nrow; src++) {
+        if (!keep[src]) continue;
+        SET_STRING_ELT(col, dest[src], STRING_ELT(col, src));
+      }
+      break;
+    case VECSXP:
+      for (R_xlen_t src = 0; src < old_nrow; src++) {
+        if (!keep[src]) continue;
+        SET_VECTOR_ELT(col, dest[src], VECTOR_ELT(col, src));
+      }
+      break;
+    default:
+      error("Unsupported column type %s", type2char(TYPEOF(col))); // #nocov
+  }
+}
@@ -96,6 +96,7 @@ static const R_CallMethodDef callMethods[] = {
     {"Cfrank", (DL_FUNC)&frank, -1},
     {"Cdt_na", (DL_FUNC)&dt_na, -1},
     {"Callocrowwrapper", (DL_FUNC)&allocrowwrapper, 2},
+    {"CdeleteRows", (DL_FUNC)&deleteRows, 2},
     {"Clookup", (DL_FUNC)&lookup, -1},
     {"Coverlaps", (DL_FUNC)&overlaps, -1},
     {"Cwhichwrapper", (DL_FUNC)&whichwrapper, -1},