From 163e4e41d174cdd9ca589013085a3b940e1bfaf8 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Fri, 23 Jan 2026 17:25:25 +0100 Subject: [PATCH 01/35] wip:SYSTEMDS-3543 --- .../compress/colgroup/ColGroupFactory.java | 182 ++++++++++++++++++ ...ColGroupPiecewiseLinearCompressedTest.java | 70 +++++++ 2 files changed, 252 insertions(+) create mode 100644 src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java index c6a098f5c32..778aeb1adb8 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java @@ -50,6 +50,7 @@ import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory; import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory; +import org.apache.sysds.runtime.compress.colgroup.scheme.ColGroupPiecewiseLinearCompressed; import org.apache.sysds.runtime.compress.cost.ACostEstimate; import org.apache.sysds.runtime.compress.estim.CompressedSizeInfo; import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup; @@ -936,6 +937,187 @@ private static AColGroup compressLinearFunctional(IColIndex colIndexes, MatrixBl return ColGroupLinearFunctional.create(colIndexes, coefficients, numRows); } + public static AColGroup compressPiecewiseLinearFunctional(IColIndex colIndexes, MatrixBlock in, CompressionSettings cs) { + + + //Erstmal den Inhalt einer Spalte speichern + + int numRows = in.getNumRows(); + int colIdx = colIndexes.get(0); //Die erste Spalte + double[] column = getColumn(in,colIdx); + + //Sette den Targetloss + + // Breakpoints bestimmen: Einteilung der Segmente + + List breakpointsList = 
computeBreakpoints(cs, column); + int[] breakpoints = breakpointsList.stream().mapToInt(Integer::intValue).toArray(); + //Für jedes Segment lineare Regression als kompressionsverfahren + + // 3) Pro Segment Regression -> a,b + int numSeg = breakpoints.length - 1; + double[] slopes = new double[numSeg]; + double[] intercepts = new double[numSeg]; + + for (int s = 0; s < numSeg; s++) { + int start = breakpoints[s]; + int end = breakpoints[s + 1]; + + double[] ab = regressSegment(column, start, end); // nutzt gleiche Stats wie computeSegmentCost + slopes[s] = ab[0]; + intercepts[s] = ab[1]; + } + //Erstelle die Datenstruktur: PiecewiseLinearColGroupCompressed + + return ColGroupPiecewiseLinearCompressed.create( + colIndexes, + breakpoints, + slopes, + intercepts, + numRows); + } + + + public static double[] getColumn(MatrixBlock in, int colIndex) { + int numRows = in.getNumRows(); // Anzahl der Zeilen [web:16] + double[] column = new double[numRows]; // Variable für die Spalte + + for (int r = 0; r < numRows; r++) { + column[r] = in.get(r, colIndex); // Wert (r, colIndex) lesen [web:16][web:25] + } + return column; + } + public static List computeBreakpoints(CompressionSettings cs, double[] column){ + int n = column.length; + double targetMSE = cs.getPiecewiseTargetLoss(); // nur lesen, NICHT setzen! 
+ + // Fall A: kein TargetLoss angegeben -> einfache Variante mit fixem λ + if (Double.isNaN(targetMSE) || targetMSE <= 0) { + double lambda = 5.0; + return computeBreakpointsLambda(column, lambda); + } + + // Fall B: TargetLoss gesetzt -> globales Fehlerbudget berücksichtigen + double sseMax = n * targetMSE; // MSE -> SSE-Budget + + double lambdaMin = 0.0; // viele Segmente, minimaler Fehler + double lambdaMax = 1e6; // wenige Segmente, mehr Fehler + + List bestBreaks = null; + + for (int it = 0; it < 20; it++) { // Binärsuche auf λ + double lambda = 0.5 * (lambdaMin + lambdaMax); + + List breaks = computeBreakpointsLambda(column, lambda); + double totalSSE = computeTotalSSE(column, breaks); + + if (totalSSE <= sseMax) { + // Budget eingehalten: wir können versuchen, mit größerem λ noch weniger Segmente zu nehmen + bestBreaks = breaks; + lambdaMin = lambda; + } else { + // Fehler zu groß: λ verkleinern, mehr Segmente zulassen + lambdaMax = lambda; + } + } + + if (bestBreaks == null) + bestBreaks = computeBreakpointsLambda(column, lambdaMin); + + return bestBreaks; + } + public static List computeBreakpointsLambda(double[] column, double lambda) { + int sizeColumn = column.length; + double[] dp = new double[sizeColumn + 1]; + int[] prev = new int[sizeColumn + 1]; + + dp[0] = 0.0; + + for (int index = 1; index <= sizeColumn; index++) { + dp[index] = Double.POSITIVE_INFINITY; + for (int i = 0; i < index; i++) { // Segment [i, index) + double costCurrentSegment = computeSegmentCost(column, i, index); // SSE + double candidateCost = dp[i] + costCurrentSegment + lambda; + if (candidateCost < dp[index]) { + dp[index] = candidateCost; + prev[index] = i; + } + } + } + + List segmentLimits = new ArrayList<>(); + int breakpointIndex = sizeColumn; + while (breakpointIndex > 0) { + segmentLimits.add(breakpointIndex); + breakpointIndex = prev[breakpointIndex]; + } + segmentLimits.add(0); + Collections.sort(segmentLimits); + return segmentLimits; + } + + public static double 
computeSegmentCost(double[] column, int start, int end) { + int n = end - start; + if (n <= 1) + return 0.0; + + double[] ab = regressSegment(column, start, end); + double slope = ab[0]; + double intercept = ab[1]; + + double sse = 0.0; + for (int i = start; i < end; i++) { + double x = i; + double y = column[i]; + double yhat = slope * x + intercept; + double diff = y - yhat; + sse += diff * diff; + } + return sse; // oder sse / n als MSE + } + public static double computeTotalSSE(double[] column, List breaks) { + double total = 0.0; + for (int s = 0; s < breaks.size() - 1; s++) { + int start = breaks.get(s); + int end = breaks.get(s + 1); + total += computeSegmentCost(column, start, end); // SSE des Segments + } + return total; + } + + + public static double[] regressSegment(double[] column, int start, int end) { + int n = end - start; + if (n <= 0) + return new double[] {0.0, 0.0}; + + double sumX = 0, sumY = 0, sumXX = 0, sumXY = 0; + for (int i = start; i < end; i++) { + double x = i; + double y = column[i]; + sumX += x; + sumY += y; + sumXX += x * x; + sumXY += x * y; + } + + double nD = n; + double denom = nD * sumXX - sumX * sumX; + double slope, intercept; + if (denom == 0) { + slope = 0.0; + intercept = sumY / nD; + } + else { + slope = (nD * sumXY - sumX * sumY) / denom; + intercept = (sumY - slope * sumX) / nD; + } + return new double[] {slope, intercept}; + } + + + + private AColGroup compressSDCFromSparseTransposedBlock(IColIndex cols, int nrUniqueEstimate, double tupleSparsity) { if(cols.size() > 1) return compressMultiColSDCFromSparseTransposedBlock(cols, nrUniqueEstimate, tupleSparsity); diff --git a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java new file mode 100644 index 00000000000..3e13c5756ac --- /dev/null +++ 
b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java @@ -0,0 +1,70 @@ +package org.apache.sysds.runtime.compress.colgroup; + +import org.apache.sysds.runtime.compress.CompressionSettings; +import org.apache.sysds.runtime.compress.CompressionSettingsBuilder; +import org.apache.sysds.runtime.compress.colgroup.indexes.ArrayIndex; +import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex; +import org.apache.sysds.runtime.compress.colgroup.scheme.ColGroupPiecewiseLinearCompressed; +import org.apache.sysds.runtime.compress.colgroup.ColGroupFactory; + +import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; + +import java.util.Arrays; +import java.util.List; + +import static org.apache.sysds.runtime.compress.colgroup.ColGroupFactory.computeBreakpoints; +import static org.junit.Assert.*; + +/** + * Tests für PiecewiseLinearColGroupCompressed, fokussiert auf: + * - Konstruktor / create(...) + * - decompressToDenseBlock(...) + */ +public class ColGroupPiecewiseLinearCompressedTest { + + private CompressionSettings cs; + // ------------------------------------------------------------- + // 1. create(...) 
und Konstruktor + // ------------------------------------------------------------- + + @BeforeEach + void setUp() { + CompressionSettings cs = new CompressionSettingsBuilder().create(); + + } + + @Test + public void testComputeBreakpoints_uniformColumn() { + cs.setPiecewiseTargetLoss(1e-3); + double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; // ← Test-spezifisch + List breaks = computeBreakpoints(cs, column); + assertEquals(Arrays.asList(0), breaks); // Erwartet: keine Breaks + } + + @Test + public void testComputeBreakpoints_linearIncreasing() { + cs.setPiecewiseTargetLoss(1e-3); + double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; // ← andere column + List breaks = computeBreakpoints(cs, column); + assertEquals(Arrays.asList(0, 2), breaks); // Erwartet + } + + @Test + public void testComputeBreakpoints_highLoss_uniform() { + cs.setPiecewiseTargetLoss(1.0); // ← andere Loss + double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; + List breaks = computeBreakpoints(cs, column); + assertEquals(Arrays.asList(0), breaks); + } + + @Test + public void testComputeBreakpoints_noLoss_linear() { + cs.setPiecewiseTargetLoss(0.0); + double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; + List breaks = computeBreakpoints(cs, column); + assertEquals(Arrays.asList(0, 1, 2, 3), breaks); // bei 0 Loss alle Breaks + } + + +} \ No newline at end of file From f5df4eac8135b49e6a4b0f840ed4dd56b9d2f029 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Fri, 23 Jan 2026 22:17:59 +0100 Subject: [PATCH 02/35] =?UTF-8?q?Meine=20lokalen=20=C3=84nderungen?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/systemds-standalone.sh | 12 + pom.xml | 8 +- .../runtime/compress/CompressionSettings.java | 29 +- .../compress/CompressionSettingsBuilder.java | 13 +- .../ColGroupPiecewiseLinearCompressed.java | 371 ++++++++++++++++++ .../colgroup/ColGroupFactoryTest.java | 5 + use-java17-systemds.sh | 57 +++ 7 files changed, 488 insertions(+), 7 
deletions(-) create mode 100755 bin/systemds-standalone.sh create mode 100644 src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java create mode 100755 use-java17-systemds.sh diff --git a/bin/systemds-standalone.sh b/bin/systemds-standalone.sh new file mode 100755 index 00000000000..9efaa963a4b --- /dev/null +++ b/bin/systemds-standalone.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# Standalone-Launcher für SystemDS + +SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) +JAR_FILE="$SCRIPT_DIR/../target/systemds-3.4.0-SNAPSHOT.jar" + +if [ ! -f "$JAR_FILE" ]; then + echo "ERROR: Standalone JAR nicht gefunden: $JAR_FILE" + exit 1 +fi + +java -cp "$JAR_FILE" org.apache.sysds.api.DMLScript "$@" diff --git a/pom.xml b/pom.xml index e0b3f794272..c0221cd11d5 100644 --- a/pom.xml +++ b/pom.xml @@ -1548,5 +1548,11 @@ fastdoubleparser 0.9.0 - + + org.junit.jupiter + junit-jupiter + RELEASE + test + + diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java index f6321bc1b6d..c5d98019947 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java @@ -21,6 +21,7 @@ import java.util.EnumSet; +import com.fasterxml.jackson.annotation.JsonAnySetter; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.sysds.runtime.compress.cocode.CoCoderFactory.PartitionerType; @@ -39,6 +40,22 @@ public class CompressionSettings { /** Parallelization threshold for DDC compression */ public static int PAR_DDC_THRESHOLD = 10000; + /** + * Ziel-Gesamtverlust für piecewise lineare Kompression. + * Interpretation: maximal erlaubter globaler MSE pro Wert in der Spalte. 
+ * 0.0 ~ quasi verlustfrei, viele Segmente + * >0 ~ mehr Approximation erlaubt, weniger Segmente + */ + + private double piecewiseTargetLoss = Double.NaN; + + public void setPiecewiseTargetLoss(double piecewiseTargetLoss) { + this.piecewiseTargetLoss = piecewiseTargetLoss; + } + public double getPiecewiseTargetLoss() { + return piecewiseTargetLoss; + } + /** * Size of the blocks used in a blocked bitmap representation. Note it is exactly Character.MAX_VALUE. This is not * Character max value + 1 because it breaks the offsets in cases with fully dense values. @@ -133,11 +150,11 @@ public class CompressionSettings { public final double[] scaleFactors; - protected CompressionSettings(double samplingRatio, double samplePower, boolean allowSharedDictionary, - String transposeInput, int seed, boolean lossy, EnumSet validCompressions, - boolean sortValuesByLength, PartitionerType columnPartitioner, int maxColGroupCoCode, double coCodePercentage, - int minimumSampleSize, int maxSampleSize, EstimationType estimationType, CostType costComputationType, - double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType, double[] scaleFactors) { + public CompressionSettings(double samplingRatio, double samplePower, boolean allowSharedDictionary, + String transposeInput, int seed, boolean lossy, EnumSet validCompressions, + boolean sortValuesByLength, PartitionerType columnPartitioner, int maxColGroupCoCode, double coCodePercentage, + int minimumSampleSize, int maxSampleSize, EstimationType estimationType, CostType costComputationType, + double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType, double[] scaleFactors) { this.samplingRatio = samplingRatio; this.samplePower = samplePower; this.allowSharedDictionary = allowSharedDictionary; @@ -181,4 +198,6 @@ public String toString() { sb.append("\t Estimation Type: " + estimationType); return sb.toString(); } + + } diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java index ae6a0b2d231..00375753d6f 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java @@ -54,7 +54,7 @@ public class CompressionSettingsBuilder { private SORT_TYPE sdcSortType = SORT_TYPE.MATERIALIZE; private double[] scaleFactors = null; - public CompressionSettingsBuilder() { + public CompressionSettingsBuilder() { DMLConfig conf = ConfigurationManager.getDMLConfig(); this.lossy = conf.getBooleanValue(DMLConfig.COMPRESSED_LOSSY); @@ -210,6 +210,17 @@ public CompressionSettingsBuilder addValidCompression(CompressionType cp) { return this; } + /** + * Ziel-Gesamtverlust für piecewise lineare Kompression. + * Interpretation: maximal erlaubter globaler MSE pro Wert in der Spalte. + * 0.0 ~ quasi verlustfrei, viele Segmente + * >0 ~ mehr Approximation erlaubt, weniger Segmente + + + public void setPiecewiseTargetLoss(double piecewiseTargetLoss) { + this.piecewiseTargetLoss = piecewiseTargetLoss; + }*/ + /** * Clear all the compression types allowed in the compression. This will only allow the Uncompressed ColGroup type. 
* Since this is required for operation of the compression diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java new file mode 100644 index 00000000000..e9e4cd1572b --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java @@ -0,0 +1,371 @@ +package org.apache.sysds.runtime.compress.colgroup.scheme; + +import org.apache.sysds.runtime.compress.colgroup.AColGroup; +import org.apache.sysds.runtime.compress.colgroup.AColGroupCompressed; +import org.apache.sysds.runtime.compress.colgroup.ColGroupUtils; +import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex; +import org.apache.sysds.runtime.compress.cost.ComputationCostEstimator; +import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup; +import org.apache.sysds.runtime.data.DenseBlock; +import org.apache.sysds.runtime.data.SparseBlock; +import org.apache.sysds.runtime.data.SparseBlockMCSR; +import org.apache.sysds.runtime.functionobjects.Builtin; +import org.apache.sysds.runtime.instructions.cp.CM_COV_Object; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.matrix.operators.BinaryOperator; +import org.apache.sysds.runtime.matrix.operators.CMOperator; +import org.apache.sysds.runtime.matrix.operators.ScalarOperator; +import org.apache.sysds.runtime.matrix.operators.UnaryOperator; + +import java.util.Arrays; + +public class ColGroupPiecewiseLinearCompressed extends AColGroupCompressed { + + IColIndex colIndexes; + int[] breakpoints; + double[] slopes; + double[] intercepts; + int numRows; + + protected ColGroupPiecewiseLinearCompressed(IColIndex colIndices) { + super(colIndices); + } + + + public ColGroupPiecewiseLinearCompressed(IColIndex colIndexes, int[] breakpoints, double[] slopes, double[] intercepts, int numRows) { + 
super(colIndexes); + this.breakpoints = breakpoints; + this.slopes = slopes; + this.intercepts = intercepts; + this.numRows = numRows; + } + + + + + + public static AColGroup create(IColIndex colIndexes, int[] breakpoints, double[] slopes, double[] intercepts, int numRows) { + if (breakpoints == null || breakpoints.length < 2) + throw new IllegalArgumentException("Need at least one segment"); + + int numSeg = breakpoints.length - 1; + if (slopes.length != numSeg || intercepts.length != numSeg) + throw new IllegalArgumentException("Inconsistent segment arrays"); + + int[] bpCopy = Arrays.copyOf(breakpoints, breakpoints.length); + double[] slopeCopy = Arrays.copyOf(slopes, slopes.length); + double[] interceptCopy = Arrays.copyOf(intercepts, intercepts.length); + + + return new ColGroupPiecewiseLinearCompressed( + colIndexes, + bpCopy, + slopeCopy, + interceptCopy, + numRows); + + } + + @Override + public void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int offC) { + final int col = colIndexes.get(0); // bei mehreren Spalten: Schleife + + // Hole das interne double[] für die Zielspalte(n) + // DenseBlock ist meist row-major, Zugriff per db.values(…) + // Einfachste Variante: Zeilenweise über db.getBlockValues(...) arbeiten. 
+ + final int numSeg = breakpoints.length - 1; + + for (int s = 0; s < numSeg; s++) { + final int segStart = breakpoints[s]; + final int segEnd = breakpoints[s + 1]; + final double a = slopes[s]; + final double b = intercepts[s]; + + // Segment auf angefragten Bereich einschränken + final int rs = Math.max(segStart, rl); + final int re = Math.min(segEnd, ru); + if (rs >= re) + continue; + + for (int r = rs; r < re; r++) { + double x = r; // selbes x wie beim Fit + double yhat = a * x + b; + + // globale Position im DenseBlock + int gr = r + offR; + int gc = col + offC; + + // db.set(row, col, value) + db.set(gr, gc, yhat); + } + } + } + @Override + protected double computeMxx(double c, Builtin builtin) { + return 0; + } + + @Override + protected void computeColMxx(double[] c, Builtin builtin) { + + } + + @Override + protected void computeSum(double[] c, int nRows) { + + } + + @Override + protected void computeSumSq(double[] c, int nRows) { + + } + + @Override + protected void computeColSumsSq(double[] c, int nRows) { + + } + + @Override + protected void computeRowSums(double[] c, int rl, int ru, double[] preAgg) { + + } + + @Override + protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru, double[] preAgg) { + + } + + @Override + protected void computeProduct(double[] c, int nRows) { + + } + + @Override + protected void computeRowProduct(double[] c, int rl, int ru, double[] preAgg) { + + } + + @Override + protected void computeColProduct(double[] c, int nRows) { + + } + + @Override + protected double[] preAggSumRows() { + return new double[0]; + } + + @Override + protected double[] preAggSumSqRows() { + return new double[0]; + } + + @Override + protected double[] preAggProductRows() { + return new double[0]; + } + + @Override + protected double[] preAggBuiltinRows(Builtin builtin) { + return new double[0]; + } + + @Override + public boolean sameIndexStructure(AColGroupCompressed that) { + return false; + } + + @Override + protected void 
tsmm(double[] result, int numColumns, int nRows) { + + } + + @Override + public AColGroup copyAndSet(IColIndex colIndexes) { + return null; + } + + @Override + public void decompressToDenseBlockTransposed(DenseBlock db, int rl, int ru) { + + } + + @Override + public void decompressToSparseBlockTransposed(SparseBlockMCSR sb, int nColOut) { + + } + + @Override + public double getIdx(int r, int colIdx) { + return 0; + } + + @Override + public int getNumValues() { + return 0; + } + + @Override + public CompressionType getCompType() { + return null; + } + + @Override + protected ColGroupType getColGroupType() { + return null; + } + + + + @Override + public void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC) { + + } + + @Override + public AColGroup rightMultByMatrix(MatrixBlock right, IColIndex allCols, int k) { + return null; + } + + @Override + public void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock result, int rl, int ru, int cl, int cu) { + + } + + @Override + public void leftMultByAColGroup(AColGroup lhs, MatrixBlock result, int nRows) { + + } + + @Override + public void tsmmAColGroup(AColGroup other, MatrixBlock result) { + + } + + @Override + public AColGroup scalarOperation(ScalarOperator op) { + return null; + } + + @Override + public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) { + return null; + } + + @Override + public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) { + return null; + } + + @Override + protected AColGroup sliceSingleColumn(int idx) { + return null; + } + + @Override + protected AColGroup sliceMultiColumns(int idStart, int idEnd, IColIndex outputCols) { + return null; + } + + @Override + public AColGroup sliceRows(int rl, int ru) { + return null; + } + + @Override + public boolean containsValue(double pattern) { + return false; + } + + @Override + public long getNumberNonZeros(int nRows) { + return 0; + } + + @Override + public AColGroup 
replace(double pattern, double replace) { + return null; + } + + @Override + public void computeColSums(double[] c, int nRows) { + + } + + @Override + public CM_COV_Object centralMoment(CMOperator op, int nRows) { + return null; + } + + @Override + public AColGroup rexpandCols(int max, boolean ignore, boolean cast, int nRows) { + return null; + } + + @Override + public double getCost(ComputationCostEstimator e, int nRows) { + return 0; + } + + @Override + public AColGroup unaryOperation(UnaryOperator op) { + return null; + } + + @Override + public AColGroup append(AColGroup g) { + return null; + } + + @Override + protected AColGroup appendNInternal(AColGroup[] groups, int blen, int rlen) { + return null; + } + + @Override + public ICLAScheme getCompressionScheme() { + return null; + } + + @Override + public AColGroup recompress() { + return null; + } + + @Override + public CompressedSizeInfoColGroup getCompressionInfo(int nRow) { + return null; + } + + @Override + protected AColGroup fixColIndexes(IColIndex newColIndex, int[] reordering) { + return null; + } + + @Override + public AColGroup reduceCols() { + return null; + } + + @Override + public double getSparsity() { + return 0; + } + + @Override + protected void sparseSelection(MatrixBlock selection, ColGroupUtils.P[] points, MatrixBlock ret, int rl, int ru) { + + } + + @Override + protected void denseSelection(MatrixBlock selection, ColGroupUtils.P[] points, MatrixBlock ret, int rl, int ru) { + + } + + @Override + public AColGroup[] splitReshape(int multiplier, int nRow, int nColOrg) { + return new AColGroup[0]; + } +} + diff --git a/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupFactoryTest.java b/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupFactoryTest.java index 0468de4dc04..597e065aab6 100644 --- a/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupFactoryTest.java +++ 
b/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupFactoryTest.java @@ -19,8 +19,10 @@ package org.apache.sysds.test.component.compress.colgroup; +import static org.apache.sysds.runtime.compress.colgroup.ColGroupFactory.computeSegmentCost; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; +import static org.junit.jupiter.api.Assertions.assertTrue; import java.util.ArrayList; import java.util.Collection; @@ -51,6 +53,7 @@ @RunWith(value = Parameterized.class) public class ColGroupFactoryTest { + private final MatrixBlock mb; private final MatrixBlock mbt; private final ACostEstimate ce; @@ -328,4 +331,6 @@ public int numBlocks() { return 2; } } + + } diff --git a/use-java17-systemds.sh b/use-java17-systemds.sh new file mode 100755 index 00000000000..0c1a2fda871 --- /dev/null +++ b/use-java17-systemds.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# ------------------------------------------------------------------ +# SystemDS macOS Build-Skript +# Setzt JAVA_HOME, PATH, Maven und erzeugt systemds-standalone.sh +# ------------------------------------------------------------------ + +# 1️⃣ Setze Java 17 +export JAVA_HOME=$(/usr/libexec/java_home -v 17) +export PATH="$JAVA_HOME/bin:/usr/bin:/bin:/usr/sbin:/sbin:/usr/local/bin:/opt/homebrew/bin:/opt/homebrew/sbin:$PATH" + +# 2️⃣ Optional: Python, ghcup, uix/Deno, Coursier, JetBrains Toolbox +export PATH="/Library/Frameworks/Python.framework/Versions/3.11/bin:$HOME/.ghcup/bin:$HOME/.uix/bin:$PATH" +export DENO_INSTALL="$HOME/.uix" +export PATH="$DENO_INSTALL/bin:$PATH" +export PATH="$PATH:/Users/mori/Library/Application Support/Coursier/bin" +export PATH="$PATH:/Users/mori/Library/Application Support/JetBrains/Toolbox/scripts" + +# 3️⃣ Prüfen, ob Maven existiert +if ! command -v mvn >/dev/null 2>&1; then + echo "ERROR: Maven (mvn) nicht gefunden. Bitte installieren!" + exit 1 +fi + +# 4️⃣ Prüfen, ob wir im Projekt-Root sind (pom.xml vorhanden) +if [ ! 
-f "pom.xml" ]; then + echo "ERROR: pom.xml nicht gefunden. Bitte ins SystemDS-Projekt-Root wechseln." + exit 1 +fi + +# 5️⃣ Maven Build ausführen +echo "📦 Starte Maven Build..." +mvn clean package -DskipTests + +# 6️⃣ Standalone-Skript erzeugen +echo "🔧 Erzeuge bin/systemds-standalone.sh..." + +mkdir -p bin +cat > bin/systemds-standalone.sh << 'EOF' +#!/bin/bash +# Standalone-Launcher für SystemDS + +SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) +JAR_FILE="$SCRIPT_DIR/../target/systemds-3.4.0-SNAPSHOT.jar" + +if [ ! -f "$JAR_FILE" ]; then + echo "ERROR: Standalone JAR nicht gefunden: $JAR_FILE" + exit 1 +fi + +java -cp "$JAR_FILE" org.apache.sysds.api.DMLScript "$@" +EOF + +# 7️⃣ Ausführbar machen +chmod +x bin/systemds-standalone.sh + +echo "✅ Fertig! Standalone-Skript erstellt: bin/systemds-standalone.sh" + From 8f5c844e5ef074a4ad2d8e267bf146604a3c0bde Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Fri, 23 Jan 2026 22:41:46 +0100 Subject: [PATCH 03/35] wip: test --- .../colgroup/ColGroupPiecewiseLinearCompressedTest.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java index 3e13c5756ac..5a740624d2d 100644 --- a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java +++ b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java @@ -21,6 +21,7 @@ * - Konstruktor / create(...) * - decompressToDenseBlock(...) 
*/ +//TODO Fix public class ColGroupPiecewiseLinearCompressedTest { private CompressionSettings cs; @@ -48,6 +49,7 @@ public void testComputeBreakpoints_linearIncreasing() { double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; // ← andere column List breaks = computeBreakpoints(cs, column); assertEquals(Arrays.asList(0, 2), breaks); // Erwartet + } @Test From 11415fa06e934a87b7617501b921d40563784098 Mon Sep 17 00:00:00 2001 From: Jannik Lindemann Date: Mon, 26 Jan 2026 10:35:00 +0100 Subject: [PATCH 04/35] Test Fix --- .../runtime/compress/CompressionSettings.java | 8 -------- .../ColGroupPiecewiseLinearCompressed.java | 12 ++++++------ .../ColGroupPiecewiseLinearCompressedTest.java | 18 +++++------------- 3 files changed, 11 insertions(+), 27 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java index 71c0a7e4d34..b853fd7f3ef 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java @@ -150,13 +150,6 @@ public double getPiecewiseTargetLoss() { public final double[] scaleFactors; -<<<<<<< HEAD - public CompressionSettings(double samplingRatio, double samplePower, boolean allowSharedDictionary, - String transposeInput, int seed, boolean lossy, EnumSet validCompressions, - boolean sortValuesByLength, PartitionerType columnPartitioner, int maxColGroupCoCode, double coCodePercentage, - int minimumSampleSize, int maxSampleSize, EstimationType estimationType, CostType costComputationType, - double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType, double[] scaleFactors) { -======= public final boolean preferDeltaEncoding; protected CompressionSettings(double samplingRatio, double samplePower, boolean allowSharedDictionary, @@ -165,7 +158,6 @@ protected CompressionSettings(double samplingRatio, double samplePower, boolean int 
minimumSampleSize, int maxSampleSize, EstimationType estimationType, CostType costComputationType, double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType, double[] scaleFactors, boolean preferDeltaEncoding) { ->>>>>>> upstream/main this.samplingRatio = samplingRatio; this.samplePower = samplePower; this.allowSharedDictionary = allowSharedDictionary; diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java index e9e4cd1572b..ec63d3bfb4e 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java @@ -10,7 +10,7 @@ import org.apache.sysds.runtime.data.SparseBlock; import org.apache.sysds.runtime.data.SparseBlockMCSR; import org.apache.sysds.runtime.functionobjects.Builtin; -import org.apache.sysds.runtime.instructions.cp.CM_COV_Object; +import org.apache.sysds.runtime.instructions.cp.CmCovObject; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.matrix.operators.BinaryOperator; import org.apache.sysds.runtime.matrix.operators.CMOperator; @@ -293,12 +293,12 @@ public void computeColSums(double[] c, int nRows) { } - @Override - public CM_COV_Object centralMoment(CMOperator op, int nRows) { - return null; - } + @Override + public CmCovObject centralMoment(CMOperator op, int nRows) { + return null; + } - @Override + @Override public AColGroup rexpandCols(int max, boolean ignore, boolean cast, int nRows) { return null; } diff --git a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java index 5a740624d2d..3335fbe5e7c 100644 --- 
a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java +++ b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java @@ -23,20 +23,9 @@ */ //TODO Fix public class ColGroupPiecewiseLinearCompressedTest { - - private CompressionSettings cs; - // ------------------------------------------------------------- - // 1. create(...) und Konstruktor - // ------------------------------------------------------------- - - @BeforeEach - void setUp() { - CompressionSettings cs = new CompressionSettingsBuilder().create(); - - } - @Test public void testComputeBreakpoints_uniformColumn() { + CompressionSettings cs = new CompressionSettingsBuilder().create(); cs.setPiecewiseTargetLoss(1e-3); double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; // ← Test-spezifisch List breaks = computeBreakpoints(cs, column); @@ -45,6 +34,7 @@ public void testComputeBreakpoints_uniformColumn() { @Test public void testComputeBreakpoints_linearIncreasing() { + CompressionSettings cs = new CompressionSettingsBuilder().create(); cs.setPiecewiseTargetLoss(1e-3); double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; // ← andere column List breaks = computeBreakpoints(cs, column); @@ -54,6 +44,7 @@ public void testComputeBreakpoints_linearIncreasing() { @Test public void testComputeBreakpoints_highLoss_uniform() { + CompressionSettings cs = new CompressionSettingsBuilder().create(); cs.setPiecewiseTargetLoss(1.0); // ← andere Loss double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; List breaks = computeBreakpoints(cs, column); @@ -62,6 +53,7 @@ public void testComputeBreakpoints_highLoss_uniform() { @Test public void testComputeBreakpoints_noLoss_linear() { + CompressionSettings cs = new CompressionSettingsBuilder().create(); cs.setPiecewiseTargetLoss(0.0); double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; List breaks = computeBreakpoints(cs, column); @@ -69,4 +61,4 @@ public void testComputeBreakpoints_noLoss_linear() { } -} \ No newline at end 
of file +} From 5301f8fb61e152bd1e31b7c91b9851d2350cd9ab Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Mon, 26 Jan 2026 20:19:12 +0100 Subject: [PATCH 05/35] wip: test --- .../runtime/compress/CompressionSettings.java | 324 +++++---- .../compress/CompressionSettingsBuilder.java | 613 +++++++++--------- .../compress/colgroup/ColGroupFactory.java | 8 +- .../ColGroupPiecewiseLinearCompressed.java | 5 +- ...ColGroupPiecewiseLinearCompressedTest.java | 380 ++++++++++- 5 files changed, 833 insertions(+), 497 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java index 71c0a7e4d34..d1f97928975 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java @@ -21,7 +21,6 @@ import java.util.EnumSet; -import com.fasterxml.jackson.annotation.JsonAnySetter; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.sysds.runtime.compress.cocode.CoCoderFactory.PartitionerType; @@ -35,181 +34,174 @@ * CompressionSettingsBuilder for default non static parameters. */ public class CompressionSettings { - private static final Log LOG = LogFactory.getLog(CompressionSettings.class.getName()); + private static final Log LOG = LogFactory.getLog(CompressionSettings.class.getName()); - /** Parallelization threshold for DDC compression */ - public static int PAR_DDC_THRESHOLD = 10000; + /** Parallelization threshold for DDC compression */ + public static int PAR_DDC_THRESHOLD = 10000; /** - * Ziel-Gesamtverlust für piecewise lineare Kompression. - * Interpretation: maximal erlaubter globaler MSE pro Wert in der Spalte. - * 0.0 ~ quasi verlustfrei, viele Segmente - * >0 ~ mehr Approximation erlaubt, weniger Segmente + * Size of the blocks used in a blocked bitmap representation. 
Note it is exactly Character.MAX_VALUE. This is not + * Character max value + 1 because it breaks the offsets in cases with fully dense values. */ + public static final int BITMAP_BLOCK_SZ = Character.MAX_VALUE; - private double piecewiseTargetLoss = Double.NaN; + /** + * Sorting of values by physical length helps by 10-20%, especially for serial, while slight performance decrease + * for parallel incl multi-threaded, hence not applied for distributed operations (also because compression time + + * garbage collection increases) + */ + public final boolean sortTuplesByFrequency; + + /** + * The sampling ratio used when choosing ColGroups. Note that, default behavior is to use exact estimator if the + * number of elements is below 1000. + * + * DEPRECATED + */ + public final double samplingRatio; + + /** + * The sampling ratio power to use when choosing sample size. This is used in accordance to the function: + * + * sampleSize += nRows^samplePower; + * + * The value is bounded to be in the range of 0 to 1, 1 giving a sample size of everything, and 0 adding 1. + */ + public final double samplePower; + + /** Share DDC Dictionaries between ColGroups. */ + public final boolean allowSharedDictionary; + + /** Boolean specifying which transpose setting is used, can be auto, true or false */ + public final String transposeInput; + + /** If the seed is -1 then the system used system millisecond time and class hash for seeding. 
*/ + public final int seed; + + /** True if lossy compression is enabled */ + public final boolean lossy; + + /** The selected method for column partitioning used in CoCoding compressed columns */ + public final PartitionerType columnPartitioner; + + /** The cost computation type for the compression */ + public final CostType costComputationType; + + /** The maximum number of columns CoCoded allowed */ + public final int maxColGroupCoCode; + + /** + * A Cocode parameter that differ in behavior based on compression method, in general it is a value that reflects + * aggressively likely coCoding is used. + */ + public final double coCodePercentage; + + /** + * Valid Compressions List, containing the ColGroup CompressionTypes that are allowed to be used for the compression + * Default is to always allow for Uncompromisable ColGroup. + */ + public final EnumSet validCompressions; + + /** The minimum size of the sample extracted. */ + public final int minimumSampleSize; + /** The maximum size of the sample extracted. */ + public final int maxSampleSize; + + /** The sample type used for sampling */ + public final EstimationType estimationType; + + /** + * Transpose input matrix, to optimize access when extracting bitmaps. This setting is changed inside the script + * based on the transposeInput setting. + * + * This is intentionally left as a mutable value, since the transposition of the input matrix is decided in phase 3. + */ + public boolean transposed = false; + + /** The minimum compression ratio to achieve. */ + public final double minimumCompressionRatio; + + + + /** Is a spark instruction */ + public final boolean isInSparkInstruction; + + /** The sorting type used in sorting/joining offsets to create SDC groups */ + public final SORT_TYPE sdcSortType; + + /** if the settings have been logged already. 
*/ + public static boolean printedStatus = false; + + public final double[] scaleFactors; + + public final boolean preferDeltaEncoding; + + /** + * Ziel-Gesamtverlust für piecewise lineare Kompression. + * Interpretation: maximal erlaubter globaler MSE pro Wert in der Spalte. + * 0.0 ~ quasi verlustfrei, viele Segmente + * >0 + ~ mehr Approximation erlaubt, weniger Segmente + */ + private double piecewiseTargetLoss = Double.NaN; public void setPiecewiseTargetLoss(double piecewiseTargetLoss) { this.piecewiseTargetLoss = piecewiseTargetLoss; + } public double getPiecewiseTargetLoss() { return piecewiseTargetLoss; } - /** - * Size of the blocks used in a blocked bitmap representation. Note it is exactly Character.MAX_VALUE. This is not - * Character max value + 1 because it breaks the offsets in cases with fully dense values. - */ - public static final int BITMAP_BLOCK_SZ = Character.MAX_VALUE; - - /** - * Sorting of values by physical length helps by 10-20%, especially for serial, while slight performance decrease - * for parallel incl multi-threaded, hence not applied for distributed operations (also because compression time + - * garbage collection increases) - */ - public final boolean sortTuplesByFrequency; - - /** - * The sampling ratio used when choosing ColGroups. Note that, default behavior is to use exact estimator if the - * number of elements is below 1000. - * - * DEPRECATED - */ - public final double samplingRatio; - - /** - * The sampling ratio power to use when choosing sample size. This is used in accordance to the function: - * - * sampleSize += nRows^samplePower; - * - * The value is bounded to be in the range of 0 to 1, 1 giving a sample size of everything, and 0 adding 1. - */ - public final double samplePower; - - /** Share DDC Dictionaries between ColGroups. 
*/ - public final boolean allowSharedDictionary; - - /** Boolean specifying which transpose setting is used, can be auto, true or false */ - public final String transposeInput; - - /** If the seed is -1 then the system used system millisecond time and class hash for seeding. */ - public final int seed; - - /** True if lossy compression is enabled */ - public final boolean lossy; - - /** The selected method for column partitioning used in CoCoding compressed columns */ - public final PartitionerType columnPartitioner; - - /** The cost computation type for the compression */ - public final CostType costComputationType; - - /** The maximum number of columns CoCoded allowed */ - public final int maxColGroupCoCode; - - /** - * A Cocode parameter that differ in behavior based on compression method, in general it is a value that reflects - * aggressively likely coCoding is used. - */ - public final double coCodePercentage; - - /** - * Valid Compressions List, containing the ColGroup CompressionTypes that are allowed to be used for the compression - * Default is to always allow for Uncompromisable ColGroup. - */ - public final EnumSet validCompressions; - - /** The minimum size of the sample extracted. */ - public final int minimumSampleSize; - - /** The maximum size of the sample extracted. */ - public final int maxSampleSize; - - /** The sample type used for sampling */ - public final EstimationType estimationType; - - /** - * Transpose input matrix, to optimize access when extracting bitmaps. This setting is changed inside the script - * based on the transposeInput setting. - * - * This is intentionally left as a mutable value, since the transposition of the input matrix is decided in phase 3. - */ - public boolean transposed = false; - - /** The minimum compression ratio to achieve. 
*/ - public final double minimumCompressionRatio; - - /** Is a spark instruction */ - public final boolean isInSparkInstruction; - - /** The sorting type used in sorting/joining offsets to create SDC groups */ - public final SORT_TYPE sdcSortType; - - /** if the settings have been logged already. */ - public static boolean printedStatus = false; - - public final double[] scaleFactors; - -<<<<<<< HEAD - public CompressionSettings(double samplingRatio, double samplePower, boolean allowSharedDictionary, - String transposeInput, int seed, boolean lossy, EnumSet validCompressions, - boolean sortValuesByLength, PartitionerType columnPartitioner, int maxColGroupCoCode, double coCodePercentage, - int minimumSampleSize, int maxSampleSize, EstimationType estimationType, CostType costComputationType, - double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType, double[] scaleFactors) { -======= - public final boolean preferDeltaEncoding; - - protected CompressionSettings(double samplingRatio, double samplePower, boolean allowSharedDictionary, - String transposeInput, int seed, boolean lossy, EnumSet validCompressions, - boolean sortValuesByLength, PartitionerType columnPartitioner, int maxColGroupCoCode, double coCodePercentage, - int minimumSampleSize, int maxSampleSize, EstimationType estimationType, CostType costComputationType, - double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType, double[] scaleFactors, - boolean preferDeltaEncoding) { ->>>>>>> upstream/main - this.samplingRatio = samplingRatio; - this.samplePower = samplePower; - this.allowSharedDictionary = allowSharedDictionary; - this.transposeInput = transposeInput; - this.seed = seed == -1 ? 
(int) System.nanoTime() : seed; - this.validCompressions = validCompressions; - this.lossy = lossy; - this.sortTuplesByFrequency = sortValuesByLength; - this.columnPartitioner = columnPartitioner; - this.maxColGroupCoCode = maxColGroupCoCode; - this.coCodePercentage = coCodePercentage; - this.minimumSampleSize = minimumSampleSize; - this.maxSampleSize = maxSampleSize; - this.estimationType = estimationType; - this.costComputationType = costComputationType; - this.minimumCompressionRatio = minimumCompressionRatio; - this.isInSparkInstruction = isInSparkInstruction; - this.sdcSortType = sdcSortType; - this.scaleFactors = scaleFactors; - this.preferDeltaEncoding = preferDeltaEncoding; - - if(!printedStatus && LOG.isDebugEnabled()) { - printedStatus = true; - LOG.debug(this.toString()); - } - } - - public boolean isRLEAllowed() { - return this.validCompressions.contains(CompressionType.RLE); - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("CompressionSettings: "); - sb.append("\t Valid Compressions: " + validCompressions); - sb.append("\t Share dict: " + allowSharedDictionary); - sb.append("\t Partitioner: " + columnPartitioner); - sb.append("\t Lossy: " + lossy); - sb.append("\t Cost Computation Type: " + costComputationType); - if(samplingRatio < 1.0) - sb.append("\t Estimation Type: " + estimationType); - return sb.toString(); - } - - -} + + protected CompressionSettings(double samplingRatio, double samplePower, boolean allowSharedDictionary, + String transposeInput, int seed, boolean lossy, EnumSet validCompressions, + boolean sortValuesByLength, PartitionerType columnPartitioner, int maxColGroupCoCode, double coCodePercentage, + int minimumSampleSize, int maxSampleSize, EstimationType estimationType, CostType costComputationType, + double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType, double[] scaleFactors, + boolean preferDeltaEncoding) { + this.samplingRatio = samplingRatio; + 
this.samplePower = samplePower; + this.allowSharedDictionary = allowSharedDictionary; + this.transposeInput = transposeInput; + this.seed = seed == -1 ? (int) System.nanoTime() : seed; + this.validCompressions = validCompressions; + this.lossy = lossy; + this.sortTuplesByFrequency = sortValuesByLength; + this.columnPartitioner = columnPartitioner; + this.maxColGroupCoCode = maxColGroupCoCode; + this.coCodePercentage = coCodePercentage; + this.minimumSampleSize = minimumSampleSize; + this.maxSampleSize = maxSampleSize; + this.estimationType = estimationType; + this.costComputationType = costComputationType; + this.minimumCompressionRatio = minimumCompressionRatio; + this.isInSparkInstruction = isInSparkInstruction; + this.sdcSortType = sdcSortType; + this.scaleFactors = scaleFactors; + this.preferDeltaEncoding = preferDeltaEncoding; + + if(!printedStatus && LOG.isDebugEnabled()) { + printedStatus = true; + LOG.debug(this.toString()); + } + } + + public boolean isRLEAllowed() { + return this.validCompressions.contains(CompressionType.RLE); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("CompressionSettings: "); + sb.append("\t Valid Compressions: " + validCompressions); + sb.append("\t Share dict: " + allowSharedDictionary); + sb.append("\t Partitioner: " + columnPartitioner); + sb.append("\t Lossy: " + lossy); + sb.append("\t Cost Computation Type: " + costComputationType); + if(samplingRatio < 1.0) + sb.append("\t Estimation Type: " + estimationType); + return sb.toString(); + } +} \ No newline at end of file diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java index df618f44f20..9af1b5aff2e 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java @@ -34,343 +34,332 @@ * 
Builder pattern for Compression Settings. See CompressionSettings for details on values. */ public class CompressionSettingsBuilder { - private double samplingRatio; - private double samplePower = 0.65; - private boolean allowSharedDictionary = false; - private String transposeInput; - private int seed = -1; - private boolean lossy = false; - private EnumSet validCompressions; - private boolean sortValuesByLength = true; - private int maxColGroupCoCode = 10000; - private double coCodePercentage = 0.01; - private int minimumSampleSize = 3000; - private int maxSampleSize = 1000000; - private EstimationType estimationType = EstimationType.HassAndStokes; - private PartitionerType columnPartitioner; - private CostType costType; - private double minimumCompressionRatio = 1.0; - private boolean isInSparkInstruction = false; - private SORT_TYPE sdcSortType = SORT_TYPE.MATERIALIZE; - private double[] scaleFactors = null; - private boolean preferDeltaEncoding = false; + private double samplingRatio; + private double samplePower = 0.65; + private boolean allowSharedDictionary = false; + private String transposeInput; + private int seed = -1; + private boolean lossy = false; + private EnumSet validCompressions; + private boolean sortValuesByLength = true; + private int maxColGroupCoCode = 10000; + private double coCodePercentage = 0.01; + private int minimumSampleSize = 3000; + private int maxSampleSize = 1000000; + private EstimationType estimationType = EstimationType.HassAndStokes; + private PartitionerType columnPartitioner; + private CostType costType; + private double minimumCompressionRatio = 1.0; + private boolean isInSparkInstruction = false; + private SORT_TYPE sdcSortType = SORT_TYPE.MATERIALIZE; + private double[] scaleFactors = null; + private boolean preferDeltaEncoding = false; public CompressionSettingsBuilder() { - DMLConfig conf = ConfigurationManager.getDMLConfig(); - this.lossy = conf.getBooleanValue(DMLConfig.COMPRESSED_LOSSY); - this.validCompressions = 
EnumSet.of(CompressionType.UNCOMPRESSED, CompressionType.CONST, CompressionType.EMPTY); - String[] validCompressionsString = conf.getTextValue(DMLConfig.COMPRESSED_VALID_COMPRESSIONS).split(","); - for(String comp : validCompressionsString) - validCompressions.add(CompressionType.valueOf(comp)); - samplingRatio = conf.getDoubleValue(DMLConfig.COMPRESSED_SAMPLING_RATIO); - columnPartitioner = PartitionerType.valueOf(conf.getTextValue(DMLConfig.COMPRESSED_COCODE)); - costType = CostType.valueOf(conf.getTextValue(DMLConfig.COMPRESSED_COST_MODEL)); - transposeInput = conf.getTextValue(DMLConfig.COMPRESSED_TRANSPOSE); - seed = DMLScript.SEED; + DMLConfig conf = ConfigurationManager.getDMLConfig(); + this.lossy = conf.getBooleanValue(DMLConfig.COMPRESSED_LOSSY); + this.validCompressions = EnumSet.of(CompressionType.UNCOMPRESSED, CompressionType.CONST, CompressionType.EMPTY); + String[] validCompressionsString = conf.getTextValue(DMLConfig.COMPRESSED_VALID_COMPRESSIONS).split(","); + for(String comp : validCompressionsString) + validCompressions.add(CompressionType.valueOf(comp)); + samplingRatio = conf.getDoubleValue(DMLConfig.COMPRESSED_SAMPLING_RATIO); + columnPartitioner = PartitionerType.valueOf(conf.getTextValue(DMLConfig.COMPRESSED_COCODE)); + costType = CostType.valueOf(conf.getTextValue(DMLConfig.COMPRESSED_COST_MODEL)); + transposeInput = conf.getTextValue(DMLConfig.COMPRESSED_TRANSPOSE); + seed = DMLScript.SEED; - } + } - /** - * Sets the scale factors for compression, enabling quantization-fused compression. - * - * @param scaleFactors An array of scale factors applied during compression. - * - If row-wise scaling is used, this should be an array where each value corresponds to a row. - * - If a single scalar is provided, it is applied uniformly to the entire matrix. - * @return The CompressionSettingsBuilder instance with the updated scale factors. 
- */ - public CompressionSettingsBuilder setScaleFactor(double[] scaleFactors) { - this.scaleFactors = scaleFactors; - return this; - } - - /** - * Copy the settings from another CompressionSettings Builder, modifies this, not that. - * - * @param that The other CompressionSettingsBuilder to copy settings from. - * @return The modified CompressionSettings in the same object. - */ - public CompressionSettingsBuilder copySettings(CompressionSettings that) { - this.samplingRatio = that.samplingRatio; - this.allowSharedDictionary = that.allowSharedDictionary; - this.transposeInput = that.transposeInput; - this.seed = that.seed; - this.lossy = that.lossy; - this.validCompressions = EnumSet.copyOf(that.validCompressions); - this.sortValuesByLength = that.sortTuplesByFrequency; - this.columnPartitioner = that.columnPartitioner; - this.maxColGroupCoCode = that.maxColGroupCoCode; - this.coCodePercentage = that.coCodePercentage; - this.minimumSampleSize = that.minimumSampleSize; - this.preferDeltaEncoding = that.preferDeltaEncoding; - return this; - } - - /** - * Set the Compression to use Lossy compression. - * - * @param lossy A boolean specifying if the compression should be lossy - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setLossy(boolean lossy) { - this.lossy = lossy; - return this; - } - - /** - * Set the sampling ratio in percent to sample the input matrix. Input value should be in range 0.0 - 1.0 - * - * @param samplingRatio The ratio to sample from the input - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setSamplingRatio(double samplingRatio) { - this.samplingRatio = samplingRatio; - return this; - } + /** + * Sets the scale factors for compression, enabling quantization-fused compression. + * + * @param scaleFactors An array of scale factors applied during compression. + * - If row-wise scaling is used, this should be an array where each value corresponds to a row. 
+ * - If a single scalar is provided, it is applied uniformly to the entire matrix. + * @return The CompressionSettingsBuilder instance with the updated scale factors. + */ + public CompressionSettingsBuilder setScaleFactor(double[] scaleFactors) { + this.scaleFactors = scaleFactors; + return this; + } - /** - * Set the sortValuesByLength flag. This sorts the dictionaries containing the data based on their occurences in the - * ColGroup. Improving cache efficiency especially for diverse column groups. - * - * @param sortValuesByLength A boolean specifying if the values should be sorted - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setSortValuesByLength(boolean sortValuesByLength) { - this.sortValuesByLength = sortValuesByLength; - return this; - } + /** + * Copy the settings from another CompressionSettings Builder, modifies this, not that. + * + * @param that The other CompressionSettingsBuilder to copy settings from. + * @return The modified CompressionSettings in the same object. + */ + public CompressionSettingsBuilder copySettings(CompressionSettings that) { + this.samplingRatio = that.samplingRatio; + this.allowSharedDictionary = that.allowSharedDictionary; + this.transposeInput = that.transposeInput; + this.seed = that.seed; + this.lossy = that.lossy; + this.validCompressions = EnumSet.copyOf(that.validCompressions); + this.sortValuesByLength = that.sortTuplesByFrequency; + this.columnPartitioner = that.columnPartitioner; + this.maxColGroupCoCode = that.maxColGroupCoCode; + this.coCodePercentage = that.coCodePercentage; + this.minimumSampleSize = that.minimumSampleSize; + this.preferDeltaEncoding = that.preferDeltaEncoding; + return this; + } - /** - * Allow the Dictionaries to be shared between different column groups. - * - * @param allowSharedDictionary A boolean specifying if the dictionary can be shared between column groups. 
- * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setAllowSharedDictionary(boolean allowSharedDictionary) { - this.allowSharedDictionary = allowSharedDictionary; - return this; - } + /** + * Set the Compression to use Lossy compression. + * + * @param lossy A boolean specifying if the compression should be lossy + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setLossy(boolean lossy) { + this.lossy = lossy; + return this; + } - /** - * Specify if the input matrix should be transposed before compression. This improves cache efficiency while - * compression the input matrix - * - * @param transposeInput string specifying if the input should be transposed before compression, should be one of - * "auto", "true" or "false" - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setTransposeInput(String transposeInput) { - switch(transposeInput) { - case "auto": - case "true": - case "false": - this.transposeInput = transposeInput; - break; - default: - throw new DMLCompressionException("Invalid transpose technique"); - } - return this; - } + /** + * Set the sampling ratio in percent to sample the input matrix. Input value should be in range 0.0 - 1.0 + * + * @param samplingRatio The ratio to sample from the input + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setSamplingRatio(double samplingRatio) { + this.samplingRatio = samplingRatio; + return this; + } - /** - * Set the seed for the compression operation. - * - * @param seed The seed used in sampling the matrix and general operations in the compression. - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setSeed(int seed) { - this.seed = seed; - return this; - } + /** + * Set the sortValuesByLength flag. This sorts the dictionaries containing the data based on their occurences in the + * ColGroup. 
Improving cache efficiency especially for diverse column groups. + * + * @param sortValuesByLength A boolean specifying if the values should be sorted + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setSortValuesByLength(boolean sortValuesByLength) { + this.sortValuesByLength = sortValuesByLength; + return this; + } - /** - * Set the valid compression strategies used for the compression. - * - * @param validCompressions An EnumSet of CompressionTypes to use in the compression - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setValidCompressions(EnumSet validCompressions) { - // should always contain Uncompressed as an option. - if(!validCompressions.contains(CompressionType.UNCOMPRESSED)) - validCompressions.add(CompressionType.UNCOMPRESSED); - if(!validCompressions.contains(CompressionType.CONST)) - validCompressions.add(CompressionType.CONST); - if(!validCompressions.contains(CompressionType.EMPTY)) - validCompressions.add(CompressionType.EMPTY); - this.validCompressions = validCompressions; - return this; - } + /** + * Allow the Dictionaries to be shared between different column groups. + * + * @param allowSharedDictionary A boolean specifying if the dictionary can be shared between column groups. + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setAllowSharedDictionary(boolean allowSharedDictionary) { + this.allowSharedDictionary = allowSharedDictionary; + return this; + } - /** - * Add a single valid compression type to the EnumSet of valid compressions. - * - * @param cp The compression type to add to the valid ones. - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder addValidCompression(CompressionType cp) { - this.validCompressions.add(cp); - return this; - } + /** + * Specify if the input matrix should be transposed before compression. 
This improves cache efficiency while + * compression the input matrix + * + * @param transposeInput string specifying if the input should be transposed before compression, should be one of + * "auto", "true" or "false" + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setTransposeInput(String transposeInput) { + switch(transposeInput) { + case "auto": + case "true": + case "false": + this.transposeInput = transposeInput; + break; + default: + throw new DMLCompressionException("Invalid transpose technique"); + } + return this; + } /** - * Ziel-Gesamtverlust für piecewise lineare Kompression. - * Interpretation: maximal erlaubter globaler MSE pro Wert in der Spalte. - * 0.0 ~ quasi verlustfrei, viele Segmente - * >0 ~ mehr Approximation erlaubt, weniger Segmente + * Set the seed for the compression operation. + * + * @param seed The seed used in sampling the matrix and general operations in the compression. + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setSeed(int seed) { + this.seed = seed; + return this; + } + /** + * Set the valid compression strategies used for the compression. + * + * @param validCompressions An EnumSet of CompressionTypes to use in the compression + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setValidCompressions(EnumSet validCompressions) { + // should always contain Uncompressed as an option. 
+ if(!validCompressions.contains(CompressionType.UNCOMPRESSED)) + validCompressions.add(CompressionType.UNCOMPRESSED); + if(!validCompressions.contains(CompressionType.CONST)) + validCompressions.add(CompressionType.CONST); + if(!validCompressions.contains(CompressionType.EMPTY)) + validCompressions.add(CompressionType.EMPTY); + this.validCompressions = validCompressions; + return this; + } - public void setPiecewiseTargetLoss(double piecewiseTargetLoss) { - this.piecewiseTargetLoss = piecewiseTargetLoss; - }*/ + /** + * Add a single valid compression type to the EnumSet of valid compressions. + * + * @param cp The compression type to add to the valid ones. + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder addValidCompression(CompressionType cp) { + this.validCompressions.add(cp); + return this; + } - /** - * Clear all the compression types allowed in the compression. This will only allow the Uncompressed ColGroup type. - * Since this is required for operation of the compression - * - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder clearValidCompression() { - this.validCompressions = EnumSet.of(CompressionType.UNCOMPRESSED, CompressionType.EMPTY, CompressionType.CONST); - return this; - } + /** + * Clear all the compression types allowed in the compression. This will only allow the Uncompressed ColGroup type. + * Since this is required for operation of the compression + * + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder clearValidCompression() { + this.validCompressions = EnumSet.of(CompressionType.UNCOMPRESSED, CompressionType.EMPTY, CompressionType.CONST); + return this; + } - /** - * Set the type of CoCoding Partitioner type to use for combining columns together. 
- * - * @param columnPartitioner The Strategy to select from PartitionerType - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setColumnPartitioner(PartitionerType columnPartitioner) { - this.columnPartitioner = columnPartitioner; - return this; - } + /** + * Set the type of CoCoding Partitioner type to use for combining columns together. + * + * @param columnPartitioner The Strategy to select from PartitionerType + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setColumnPartitioner(PartitionerType columnPartitioner) { + this.columnPartitioner = columnPartitioner; + return this; + } - /** - * Set the maximum number of columns to CoCode together in the CoCoding strategy. Compression time increase with - * higher numbers. - * - * @param maxColGroupCoCode The max selected. - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setMaxColGroupCoCode(int maxColGroupCoCode) { - this.maxColGroupCoCode = maxColGroupCoCode; - return this; - } + /** + * Set the maximum number of columns to CoCode together in the CoCoding strategy. Compression time increase with + * higher numbers. + * + * @param maxColGroupCoCode The max selected. + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setMaxColGroupCoCode(int maxColGroupCoCode) { + this.maxColGroupCoCode = maxColGroupCoCode; + return this; + } - /** - * Set the coCode percentage, the effect is different based on the coCoding strategy, but the general effect is that - * higher values results in more coCoding while lower values result in less. - * - * Note that with high coCoding the compression ratio would possibly be lower. - * - * @param coCodePercentage The percentage to set. 
- * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setCoCodePercentage(double coCodePercentage) { - this.coCodePercentage = coCodePercentage; - return this; - } + /** + * Set the coCode percentage, the effect is different based on the coCoding strategy, but the general effect is that + * higher values results in more coCoding while lower values result in less. + * + * Note that with high coCoding the compression ratio would possibly be lower. + * + * @param coCodePercentage The percentage to set. + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setCoCodePercentage(double coCodePercentage) { + this.coCodePercentage = coCodePercentage; + return this; + } - /** - * Set the minimum sample size to extract from a given matrix, this overrules the sample percentage if the sample - * percentage extracted is lower than this minimum bound. - * - * @param minimumSampleSize The minimum sample size to extract - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setMinimumSampleSize(int minimumSampleSize) { - this.minimumSampleSize = minimumSampleSize; - return this; - } + /** + * Set the minimum sample size to extract from a given matrix, this overrules the sample percentage if the sample + * percentage extracted is lower than this minimum bound. + * + * @param minimumSampleSize The minimum sample size to extract + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setMinimumSampleSize(int minimumSampleSize) { + this.minimumSampleSize = minimumSampleSize; + return this; + } - /** - * Set the maximum sample size to extract from a given matrix, this overrules the sample percentage if the sample - * percentage extracted is higher than this maximum bound. 
- * - * @param maxSampleSize The maximum sample size to extract - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setMaxSampleSize(int maxSampleSize) { - this.maxSampleSize = maxSampleSize; - return this; - } + /** + * Set the maximum sample size to extract from a given matrix, this overrules the sample percentage if the sample + * percentage extracted is higher than this maximum bound. + * + * @param maxSampleSize The maximum sample size to extract + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setMaxSampleSize(int maxSampleSize) { + this.maxSampleSize = maxSampleSize; + return this; + } - /** - * Set the estimation type used for the sampled estimates. - * - * @param estimationType the estimation type in used. - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setEstimationType(EstimationType estimationType) { - this.estimationType = estimationType; - return this; - } + /** + * Set the estimation type used for the sampled estimates. + * + * @param estimationType the estimation type in used. + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setEstimationType(EstimationType estimationType) { + this.estimationType = estimationType; + return this; + } - /** - * Set the cost type used for estimating the cost of column groups default is memory based. - * - * @param costType The Cost type wanted - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setCostType(CostType costType) { - this.costType = costType; - return this; - } + /** + * Set the cost type used for estimating the cost of column groups default is memory based. + * + * @param costType The Cost type wanted + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setCostType(CostType costType) { + this.costType = costType; + return this; + } - /** - * Set the minimum compression ratio to be achieved by the compression. 
- * - * @param ratio The ratio to achieve while compressing - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setMinimumCompressionRatio(double ratio) { - this.minimumCompressionRatio = ratio; - return this; - } + /** + * Set the minimum compression ratio to be achieved by the compression. + * + * @param ratio The ratio to achieve while compressing + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setMinimumCompressionRatio(double ratio) { + this.minimumCompressionRatio = ratio; + return this; + } - /** - * Inform the compression that it is executed in a spark instruction. - * - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setIsInSparkInstruction() { - this.isInSparkInstruction = true; - return this; - } + /** + * Inform the compression that it is executed in a spark instruction. + * + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setIsInSparkInstruction() { + this.isInSparkInstruction = true; + return this; + } - /** - * Set the sort type to use. - * - * @param sdcSortType The sort type for the construction of SDC groups - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setSDCSortType(SORT_TYPE sdcSortType) { - this.sdcSortType = sdcSortType; - return this; - } + /** + * Set the sort type to use. + * + * @param sdcSortType The sort type for the construction of SDC groups + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setSDCSortType(SORT_TYPE sdcSortType) { + this.sdcSortType = sdcSortType; + return this; + } - /** - * Set whether to prefer delta encoding during compression estimation. - * When enabled, the compression estimator will use delta encoding statistics - * instead of regular encoding statistics. 
- * - * @param preferDeltaEncoding Whether to prefer delta encoding - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setPreferDeltaEncoding(boolean preferDeltaEncoding) { - this.preferDeltaEncoding = preferDeltaEncoding; - return this; - } + /** + * Set whether to prefer delta encoding during compression estimation. + * When enabled, the compression estimator will use delta encoding statistics + * instead of regular encoding statistics. + * + * @param preferDeltaEncoding Whether to prefer delta encoding + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setPreferDeltaEncoding(boolean preferDeltaEncoding) { + this.preferDeltaEncoding = preferDeltaEncoding; + return this; + } - /** - * Create the CompressionSettings object to use in the compression. - * - * @return The CompressionSettings - */ - public CompressionSettings create() { - return new CompressionSettings(samplingRatio, samplePower, allowSharedDictionary, transposeInput, seed, lossy, - validCompressions, sortValuesByLength, columnPartitioner, maxColGroupCoCode, coCodePercentage, - minimumSampleSize, maxSampleSize, estimationType, costType, minimumCompressionRatio, isInSparkInstruction, - sdcSortType, scaleFactors, preferDeltaEncoding); - } -} + /** + * Create the CompressionSettings object to use in the compression. 
+ * + * @return The CompressionSettings + */ + public CompressionSettings create() { + return new CompressionSettings(samplingRatio, samplePower, allowSharedDictionary, transposeInput, seed, lossy, + validCompressions, sortValuesByLength, columnPartitioner, maxColGroupCoCode, coCodePercentage, + minimumSampleSize, maxSampleSize, estimationType, costType, minimumCompressionRatio, isInSparkInstruction, + sdcSortType, scaleFactors, preferDeltaEncoding); + } +} \ No newline at end of file diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java index aa11a0c00a0..49901004ff0 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java @@ -1080,7 +1080,8 @@ public static AColGroup compressPiecewiseLinearFunctional(IColIndex colIndexes, // Breakpoints bestimmen: Einteilung der Segmente - List breakpointsList = computeBreakpoints(cs, column); + double targetLoss = 1e-3; + List breakpointsList = computeBreakpoints(cs, column,targetLoss); int[] breakpoints = breakpointsList.stream().mapToInt(Integer::intValue).toArray(); //Für jedes Segment lineare Regression als kompressionsverfahren @@ -1117,10 +1118,9 @@ public static double[] getColumn(MatrixBlock in, int colIndex) { } return column; } - public static List computeBreakpoints(CompressionSettings cs, double[] column){ + public static List computeBreakpoints(CompressionSettings cs, double[] column, double targetloss){ int n = column.length; - double targetMSE = cs.getPiecewiseTargetLoss(); // nur lesen, NICHT setzen! 
- + double targetMSE = targetloss; // Fall A: kein TargetLoss angegeben -> einfache Variante mit fixem λ if (Double.isNaN(targetMSE) || targetMSE <= 0) { double lambda = 5.0; diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java index e9e4cd1572b..af4b6dad172 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java @@ -10,7 +10,7 @@ import org.apache.sysds.runtime.data.SparseBlock; import org.apache.sysds.runtime.data.SparseBlockMCSR; import org.apache.sysds.runtime.functionobjects.Builtin; -import org.apache.sysds.runtime.instructions.cp.CM_COV_Object; +import org.apache.sysds.runtime.instructions.cp.CmCovObject; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.matrix.operators.BinaryOperator; import org.apache.sysds.runtime.matrix.operators.CMOperator; @@ -294,10 +294,11 @@ public void computeColSums(double[] c, int nRows) { } @Override - public CM_COV_Object centralMoment(CMOperator op, int nRows) { + public CmCovObject centralMoment(CMOperator op, int nRows) { return null; } + @Override public AColGroup rexpandCols(int max, boolean ignore, boolean cast, int nRows) { return null; diff --git a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java index 5a740624d2d..b41155c0e49 100644 --- a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java +++ b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java @@ -3,17 +3,19 @@ import 
org.apache.sysds.runtime.compress.CompressionSettings; import org.apache.sysds.runtime.compress.CompressionSettingsBuilder; import org.apache.sysds.runtime.compress.colgroup.indexes.ArrayIndex; +import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory; import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex; import org.apache.sysds.runtime.compress.colgroup.scheme.ColGroupPiecewiseLinearCompressed; import org.apache.sysds.runtime.compress.colgroup.ColGroupFactory; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.junit.Test; import org.junit.jupiter.api.BeforeEach; import java.util.Arrays; import java.util.List; -import static org.apache.sysds.runtime.compress.colgroup.ColGroupFactory.computeBreakpoints; +import static org.apache.sysds.runtime.compress.colgroup.ColGroupFactory.*; import static org.junit.Assert.*; /** @@ -37,36 +39,388 @@ void setUp() { @Test public void testComputeBreakpoints_uniformColumn() { - cs.setPiecewiseTargetLoss(1e-3); + //cs.setPiecewiseTargetLoss(1e-3); double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; // ← Test-spezifisch - List breaks = computeBreakpoints(cs, column); - assertEquals(Arrays.asList(0), breaks); // Erwartet: keine Breaks + List breaks = computeBreakpoints(cs, column,1e-3); + assertEquals(Arrays.asList(0,5), breaks); // Erwartet: keine Breaks } @Test public void testComputeBreakpoints_linearIncreasing() { - cs.setPiecewiseTargetLoss(1e-3); + //cs.setPiecewiseTargetLoss(1e-3); double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; // ← andere column - List breaks = computeBreakpoints(cs, column); - assertEquals(Arrays.asList(0, 2), breaks); // Erwartet + List breaks = computeBreakpoints(cs, column,1e-3); + assertEquals(Arrays.asList(0, 5), breaks); // Erwartet } @Test public void testComputeBreakpoints_highLoss_uniform() { - cs.setPiecewiseTargetLoss(1.0); // ← andere Loss + //cs.setPiecewiseTargetLoss(1.0); // ← andere Loss double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; - List breaks = 
computeBreakpoints(cs, column); - assertEquals(Arrays.asList(0), breaks); + List breaks = computeBreakpoints(cs, column,10000.0); + assertEquals(Arrays.asList(0,5), breaks); + } + @Test + public void testComputeBreakpoints_twoSegments() { + // {1,1,1, 2,2,2} → 2 Segmente → [0,3,6] + double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0}; + var breaks = computeBreakpoints(cs, column, 1e-3); + assertEquals(Arrays.asList(0, 3, 6), breaks); } @Test public void testComputeBreakpoints_noLoss_linear() { - cs.setPiecewiseTargetLoss(0.0); + //cs.setPiecewiseTargetLoss(0.0); double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; - List breaks = computeBreakpoints(cs, column); - assertEquals(Arrays.asList(0, 1, 2, 3), breaks); // bei 0 Loss alle Breaks + List breaks = computeBreakpoints(cs, column,0.0); + assertEquals(Arrays.asList(0,5), breaks); // bei 0 Loss alle Breaks + } + @Test + public void testComputeBreakpointsLambda_const() { + double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; // 5 Werte + List breaks = computeBreakpointsLambda(column, 5.0); + assertEquals(Arrays.asList(0, 5), breaks); // 0 bis 5 + + breaks = computeBreakpointsLambda(column, 0.01); + assertEquals(Arrays.asList(0, 5), breaks); // auch mit kleinem lambda + } + @Test + public void testComputeBreakpointsLambda_twoSegments() { + double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0}; // 6 Werte + + // mit kleinem lambda -> viele Segmente (kostenlos fast) + List breaks = computeBreakpointsLambda(column, 0.01); + assertTrue(breaks.contains(3)); // 3 muss als Grenze enthalten sein + assertEquals(3, breaks.size()); // 0, 3, 6 + assertEquals(Arrays.asList(0, 3, 6), breaks); + + // mit großem lambda -> nur ein Segment + breaks = computeBreakpointsLambda(column, 1000.0); + assertEquals(Arrays.asList(0, 6), breaks); + } + @Test + public void testComputeBreakpointsLambda_jumpWithTrend() { + double[] column = {0.0, 1.0, 2.0, 10.0, 11.0, 12.0}; + + // grobe Segmentanpassung: ein Segment pro „Abschnitt“ + List breaks = 
computeBreakpointsLambda(column, 0.5); + assertEquals(Arrays.asList(0, 3, 6), breaks); + + // nur ein Segment, wenn lambda sehr groß + breaks = computeBreakpointsLambda(column, 100.0); + assertEquals(Arrays.asList(0, 6), breaks); + } + + @Test + public void testComputeBreakpointsLambda_linear() { + double[] column = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0}; + + List breaks = computeBreakpointsLambda(column, 1.0); + assertEquals(Arrays.asList(0, 6), breaks); + + // mit sehr kleinem lambda: wir prüfen nur, dass die Grenzen vernünftig sind + breaks = computeBreakpointsLambda(column, 0.001); + assertTrue(breaks.size() >= 2); + assertTrue(breaks.get(0) == 0); + assertTrue(breaks.get(breaks.size() - 1) == column.length); + } + @Test + public void testComputeBreakpointsLambda_edge_lambdaVerySmall() { + double[] column = {1.0, 1.1, 1.0, 1.1, 1.0}; + + List breaks = computeBreakpointsLambda(column, 0.001); + assertNotNull(breaks); + assertFalse(breaks.isEmpty()); + assertEquals(0, (int) breaks.get(0)); + assertEquals(column.length, (int) breaks.get(breaks.size() - 1)); + + // Prüfe, dass die Liste sortiert ist + for (int i = 1; i < breaks.size(); i++) { + assertTrue(breaks.get(i) >= breaks.get(i - 1)); + } + } + @Test + public void testComputeBreakpointsLambda_edge_lambdaVeryLarge() { + double[] column = {1.0, 2.0, 1.5, 2.5, 1.8}; + + List breaks = computeBreakpointsLambda(column, 1000.0); + assertEquals(Arrays.asList(0, 5), breaks); + } + @Test + public void testComputeSegmentCost_emptyOrSingle() { + double[] column = {10.0, 20.0, 30.0}; + + // 0 Elemente (leer) + assertEquals(0.0, computeSegmentCost(column, 0, 0), 1e-10); + assertEquals(0.0, computeSegmentCost(column, 1, 1), 1e-10); + + // 1 Element → Regressionsgerade ist nicht eindeutig definiert, aber SSE=0 + assertEquals(0.0, computeSegmentCost(column, 0, 1), 1e-10); + assertEquals(0.0, computeSegmentCost(column, 1, 2), 1e-10); + assertEquals(0.0, computeSegmentCost(column, 2, 3), 1e-10); } + @Test + public void 
testComputeSegmentCost_twoConstantPoints() { + double[] column = {5.0, 5.0, 1.0, 1.0}; + + // Zwei identische Punkte (konstant) → SSE = 0 + double sse = computeSegmentCost(column, 0, 2); + assertEquals(0.0, sse, 1e-10); + } + @Test + public void testComputeSegmentCost_twoDifferentPoints() { + double[] column = {0.0, 2.0, 1.0, 3.0}; + + // Zwei Punkte: (0,0) und (1,2) → Gerade y = 2*x, Fehler = 0 + double sse = computeSegmentCost(column, 0, 2); + assertEquals(0.0, sse, 1e-10); + + // Zwei Punkte: (2,1) und (3,3) → Gerade y = 2*x - 3, Fehler = 0 + sse = computeSegmentCost(column, 2, 4); + assertEquals(0.0, sse, 1e-10); + } + @Test + public void testComputeSegmentCost_constantThree() { + double[] column = {0.0, 0.0, 0.0}; + double sse = computeSegmentCost(column, 0, 3); + assertEquals(0.0, sse, 1e-10); + } + @Test + public void testComputeSegmentCost_consistent_with_regression() { + double[] column = {0.0, 2.0, 0.0, 4.0, 0.0, 6.0}; // 6 Punkte + + int start = 0, end = 3; + double[] ab = regressSegment(column, start, end); + double slope = ab[0], intercept = ab[1]; + double sse_hand = 0.0; + for (int i = start; i < end; i++) { + double yhat = slope * i + intercept; + double diff = column[i] - yhat; + sse_hand += diff * diff; + } + + double sse = computeSegmentCost(column, start, end); + assertEquals(sse_hand, sse, 1e-10); + } + @Test + public void testComputeTotalSSE_emptyBreaks() { + double[] column = {1.0, 2.0, 3.0}; + List breaks = Arrays.asList(); // leer → keine Segmente + double total = computeTotalSSE(column, breaks); + + // 0 Segmente → Summe über 0 Segmente = 0 + assertEquals(0.0, total, 1e-10); + } + @Test + public void testComputeTotalSSE_singleSegment_all() { + double[] column = {1.0, 2.0, 3.0}; + List breaks = Arrays.asList(0, 3); // ein Segment [0,3) + + double total = computeTotalSSE(column, breaks); + double expected = computeSegmentCost(column, 0, 3); + + // Ergebnis muss exakt das gleiche wie der SSE des gesamten Segments sein + assertEquals(expected, 
total, 1e-10); + } + @Test + public void testComputeTotalSSE_twoSegments() { + // Beispiel: [0,0,0] und [1,1,1] (jeweils konstant) + double[] column = {0.0, 0.0, 0.0, 1.0, 1.0, 1.0}; + List breaks = Arrays.asList(0, 3, 6); // zwei Segmente + + double total = computeTotalSSE(column, breaks); + double sse1 = computeSegmentCost(column, 0, 3); // [0,0,0] → SSE = 0 + double sse2 = computeSegmentCost(column, 3, 6); // [1,1,1] → SSE = 0 + + // da beide Segmente konstant sind, muss totalSSE = 0 sein + assertEquals(0.0, total, 1e-10); + assertEquals(sse1 + sse2, total, 1e-10); + } + @Test + public void testComputeTotalSSE_threeSegments() { + // Ein Segment mit drei identischen Werten, zwei Segmente mit jeweils zwei Werten + double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0}; + List breaks = Arrays.asList(0, 3, 5, 7); + + // Segment [0,3): konstant 1.0 → SSE = 0 + double sse1 = computeSegmentCost(column, 0, 3); // 0 + + // Segment [3,5): [2,2] → SSE = 0 + double sse2 = computeSegmentCost(column, 3, 5); // 0 + + // Segment [5,7): [3,3] → SSE = 0 + double sse3 = computeSegmentCost(column, 5, 7); // 0 + + double total = computeTotalSSE(column, breaks); + assertEquals(0.0, total, 1e-10); + assertEquals(sse1 + sse2 + sse3, total, 1e-10); + } + @Test + public void testComputeTotalSSE_gapStartEnd() { + double[] column = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0}; + List breaks = Arrays.asList(2, 5, 8); // Segmente [2,5), [5,8) + + double total = computeTotalSSE(column, breaks); + double sse1 = computeSegmentCost(column, 2, 5); + double sse2 = computeSegmentCost(column, 5, 8); + + // Resultat: Summe der zwei Segmente + assertEquals(sse1 + sse2, total, 1e-10); + + // Die Indizes <2 und >=8 sind nicht Teil der Segmente und fließen nicht in totalSSE ein + } + @Test + public void testComputeTotalSSE_oneSegment_identical() { + double[] column = {1.0, 2.0, 3.0, 4.0, 5.0}; + + // Vergleich: SSE des gesamten Segments über [0,5) + double sseTotal = computeSegmentCost(column, 0, 
5); + + // Berechnung mit computeTotalSSE und breaks [0,5] + List breaks = Arrays.asList(0, 5); + double total = computeTotalSSE(column, breaks); + + // beide müssen exakt gleich sein + assertEquals(sseTotal, total, 1e-10); + } + @Test + public void testComputeTotalSSE_nonConstant() { + double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; + List breaks = Arrays.asList(0, 2, 5); // [0,2), [2,5) + + double total = computeTotalSSE(column, breaks); + double sse1 = computeSegmentCost(column, 0, 2); + double sse2 = computeSegmentCost(column, 2, 5); + + // Sanity-Check: Ergebnis positiv, Summe der beiden SSE + assertTrue(total >= 0.0); + assertEquals(sse1 + sse2, total, 1e-10); + } + @Test + public void testComputeTotalSSE_edgeCases() { + // Leere Spalte, Segmente [0,0] → kein Segment + double[] columnEmpty = {}; // length 0 + List breaksEmpty = Arrays.asList(0, 0); + assertEquals(0.0, computeTotalSSE(columnEmpty, breaksEmpty), 1e-10); + + // Spalte der Länge 1, ein Segment [0,1) + double[] columnOne = {42.0}; + List breaksOne = Arrays.asList(0, 1); + double total = computeTotalSSE(columnOne, breaksOne); + assertEquals(0.0, total, 1e-10); + } + @Test + public void testRegressSegment_empty() { + double[] column = {1.0, 2.0, 3.0}; + double[] result = regressSegment(column, 0, 0); // leer + assertEquals(0.0, result[0], 1e-10); + assertEquals(0.0, result[1], 1e-10); + } + @Test + public void testRegressSegment_singlePoint() { + double[] column = {1.0, 2.0, 3.0}; + double[] result = regressSegment(column, 1, 2); // nur i=1: y=2.0 + + assertEquals(0.0, result[0], 1e-10); // slope = 0 + assertEquals(2.0, result[1], 1e-10); // intercept = Mittelwert + } + @Test + public void testRegressSegment_twoIdentical() { + double[] column = {5.0, 5.0, 1.0, 1.0}; + double[] result = regressSegment(column, 0, 2); // i=0:5, i=1:5 + + // Steigung = 0, y = 5.0 + 0*i + assertEquals(0.0, result[0], 1e-10); + assertEquals(5.0, result[1], 1e-10); + } + @Test + public void testRegressSegment_twoPoints() { + 
double[] column = {0.0, 2.0}; // (i=0, y=0), (i=1, y=2) + double[] result = regressSegment(column, 0, 2); + + // Gerade durch (0,0) und (1,2) → y = 2*i + 0 + assertEquals(2.0, result[0], 1e-10); + assertEquals(0.0, result[1], 1e-10); + } + @Test + public void testRegressSegment_twoPoints_offset() { + // column[0], column[1], column[2], column[3] → es gibt 4 Werte + double[] column = {1.0, 3.0, 5.0, 7.0}; // z. B. y = 2*x + 1 → bei x=2: y=5, x=3: y=7 + double[] result = regressSegment(column, 2, 4); // Segment [2,4) → i=2,3 + + // Gerade durch (2,5), (3,7): slope = 2, intercept = 1 + assertEquals(2.0, result[0], 1e-10); + assertEquals(1.0, result[1], 1e-10); + } + @Test + public void testRegressSegment_constant() { + double[] column = {3.0, 3.0, 3.0, 3.0}; + double[] result = regressSegment(column, 0, 4); + + assertEquals(0.0, result[0], 1e-10); + assertEquals(3.0, result[1], 1e-10); + } + @Test + public void testRegressSegment_linear() { + double[] column = new double[4]; + double a = 1.5, b = 2.0; + for (int i = 0; i < 4; i++) { + column[i] = a * i + b; + } + + double[] result = regressSegment(column, 0, 4); + + // Exakt: slope = 1.5, intercept = 2.0 + assertEquals(a, result[0], 1e-10); + assertEquals(b, result[1], 1e-10); + } + @Test + public void testRegressSegment_denomZero() { + // fiktiv: ein Segment mit einem Punkt + double[] column = {10.0}; + double[] result = regressSegment(column, 0, 1); + + assertEquals(0.0, result[0], 1e-10); + assertEquals(10.0, result[1], 1e-10); + } + + @Test + public void testCompressPiecewiseLinearFunctional_const() { + // 1. MatrixBlock mit einer konstanten Spalte erzeugen + double[] data = {1.0, 1.0, 1.0, 1.0, 1.0}; // 5 Zeilen, 1 Spalte + MatrixBlock in = new MatrixBlock(5, 1, false).quickSetMatrix(data, 5); + + // 2. colIndexes für Spalte 0 + IColIndex colIndexes = ColIndexFactory.create(0); + + // 3. 
Aufruf der Kompressionsfunktion + AColGroup result = ColGroupFactory.compressPiecewiseLinearFunctional(colIndexes, in, new CompressionSettings()); + + // 4. Ergebnis ist eine ColGroupPiecewiseLinearCompressed? + assertTrue(result instanceof ColGroupPiecewiseLinearCompressed); + ColGroupPiecewiseLinearCompressed plGroup = (ColGroupPiecewiseLinearCompressed) result; + + // 5. Check Breakpoints: [0, 5] → ein Segment + int[] breakpoints = plGroup.c(); + assertArrayEquals(new int[] {0, 5}, breakpoints); + + // 6. Pro Segment: 1 Segment → ein slope, ein intercept + double[] slopes = plGroup.getSlopes(); + double[] intercepts = plGroup.getIntercepts(); + assertEquals(1, slopes.length); + assertEquals(1, intercepts.length); + + // 7. Für konstante Daten: Steigung ~0, intercept ~1.0 + assertEquals(0.0, slopes[0], 1e-10); + assertEquals(1.0, intercepts[0], 1e-10); // Mittelwert der Spalte + + // 8. Check: colIndexes stimmt + assertEquals(1, plGroup.getColIndex().size()); + assertEquals(0, plGroup.getColIndex().get(0)); + } + + } \ No newline at end of file From d63aae8d89d157df824e21184da7e96fb90b933e Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Wed, 28 Jan 2026 01:48:56 +0100 Subject: [PATCH 06/35] fix: Methods and testing --- .../compress/colgroup/ColGroupFactory.java | 8 +- .../ColGroupPiecewiseLinearCompressed.java | 85 ++-- ...ColGroupPiecewiseLinearCompressedTest.java | 405 ++++++++++++++---- 3 files changed, 389 insertions(+), 109 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java index 49901004ff0..06bf74b423c 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java @@ -1080,8 +1080,8 @@ public static AColGroup compressPiecewiseLinearFunctional(IColIndex colIndexes, // 
Breakpoints bestimmen: Einteilung der Segmente - double targetLoss = 1e-3; - List breakpointsList = computeBreakpoints(cs, column,targetLoss); + double targetLoss = cs.getPiecewiseTargetLoss(); + List breakpointsList = computeBreakpoints(cs, column); int[] breakpoints = breakpointsList.stream().mapToInt(Integer::intValue).toArray(); //Für jedes Segment lineare Regression als kompressionsverfahren @@ -1118,9 +1118,9 @@ public static double[] getColumn(MatrixBlock in, int colIndex) { } return column; } - public static List computeBreakpoints(CompressionSettings cs, double[] column, double targetloss){ + public static List computeBreakpoints(CompressionSettings cs, double[] column){ int n = column.length; - double targetMSE = targetloss; + double targetMSE = cs.getPiecewiseTargetLoss(); // Fall A: kein TargetLoss angegeben -> einfache Variante mit fixem λ if (Double.isNaN(targetMSE) || targetMSE <= 0) { double lambda = 5.0; diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java index af4b6dad172..71e935643d9 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java @@ -41,9 +41,6 @@ public ColGroupPiecewiseLinearCompressed(IColIndex colIndexes, int[] breakpoints } - - - public static AColGroup create(IColIndex colIndexes, int[] breakpoints, double[] slopes, double[] intercepts, int numRows) { if (breakpoints == null || breakpoints.length < 2) throw new IllegalArgumentException("Need at least one segment"); @@ -68,39 +65,44 @@ public static AColGroup create(IColIndex colIndexes, int[] breakpoints, double[] @Override public void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int offC) { - final int col = colIndexes.get(0); // 
bei mehreren Spalten: Schleife + // ✅ Vollständige Null-Safety + if (db == null || _colIndexes == null || _colIndexes.size() == 0 || + breakpoints == null || slopes == null || intercepts == null) { + return; + } - // Hole das interne double[] für die Zielspalte(n) - // DenseBlock ist meist row-major, Zugriff per db.values(…) - // Einfachste Variante: Zeilenweise über db.getBlockValues(...) arbeiten. + int numSeg = breakpoints.length - 1; + if (numSeg <= 0 || rl >= ru) { + return; + } - final int numSeg = breakpoints.length - 1; + final int col = _colIndexes.get(0); for (int s = 0; s < numSeg; s++) { - final int segStart = breakpoints[s]; - final int segEnd = breakpoints[s + 1]; - final double a = slopes[s]; - final double b = intercepts[s]; + int segStart = breakpoints[s]; + int segEnd = breakpoints[s + 1]; + if (segStart >= segEnd) continue; // Invalid Segment - // Segment auf angefragten Bereich einschränken - final int rs = Math.max(segStart, rl); - final int re = Math.min(segEnd, ru); - if (rs >= re) - continue; + double a = slopes[s]; + double b = intercepts[s]; - for (int r = rs; r < re; r++) { - double x = r; // selbes x wie beim Fit - double yhat = a * x + b; - - // globale Position im DenseBlock - int gr = r + offR; - int gc = col + offC; + int rs = Math.max(segStart, rl); + int re = Math.min(segEnd, ru); + if (rs >= re) continue; - // db.set(row, col, value) - db.set(gr, gc, yhat); + for (int r = rs; r < re; r++) { + double yhat = a * r + b; + int gr = offR + r; + int gc = offC + col; + + // ✅ Bounds-Check vor set() + if (gr >= 0 && gr < db.numRows() && gc >= 0 && gc < db.numCols()) { + db.set(gr, gc, yhat); + } } } } + @Override protected double computeMxx(double c, Builtin builtin) { return 0; @@ -198,12 +200,26 @@ public void decompressToSparseBlockTransposed(SparseBlockMCSR sb, int nColOut) { @Override public double getIdx(int r, int colIdx) { - return 0; + // ✅ CRUCIAL: Bounds-Check für colIdx! 
+ if (r < 0 || r >= numRows || colIdx < 0 || colIdx >= _colIndexes.size()) { + return 0.0; + } + + // Segment-Suche (sicher jetzt) + int seg = 0; + for (int i = 1; i < breakpoints.length; i++) { + if (r < breakpoints[i]) { + break; + } + seg = i - 1; // seg < numSeg immer! + } + + return slopes[seg] * (double) r + intercepts[seg]; } @Override public int getNumValues() { - return 0; + return breakpoints.length + slopes.length + intercepts.length; } @Override @@ -368,5 +384,18 @@ protected void denseSelection(MatrixBlock selection, ColGroupUtils.P[] points, M public AColGroup[] splitReshape(int multiplier, int nRow, int nColOrg) { return new AColGroup[0]; } + + public int[] getBreakpoints() { + return breakpoints; + } + + public double[] getSlopes() { + return slopes; + } + + + public double[] getIntercepts() { + return intercepts; + } } diff --git a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java index b41155c0e49..c0ca62ce9d5 100644 --- a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java +++ b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java @@ -8,6 +8,7 @@ import org.apache.sysds.runtime.compress.colgroup.scheme.ColGroupPiecewiseLinearCompressed; import org.apache.sysds.runtime.compress.colgroup.ColGroupFactory; +import org.apache.sysds.runtime.data.DenseBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.junit.Test; import org.junit.jupiter.api.BeforeEach; @@ -18,87 +19,83 @@ import static org.apache.sysds.runtime.compress.colgroup.ColGroupFactory.*; import static org.junit.Assert.*; -/** - * Tests für PiecewiseLinearColGroupCompressed, fokussiert auf: - * - Konstruktor / create(...) - * - decompressToDenseBlock(...) 
- */ -//TODO Fix -public class ColGroupPiecewiseLinearCompressedTest { - - private CompressionSettings cs; - // ------------------------------------------------------------- - // 1. create(...) und Konstruktor - // ------------------------------------------------------------- - @BeforeEach - void setUp() { - CompressionSettings cs = new CompressionSettingsBuilder().create(); +public class ColGroupPiecewiseLinearCompressedTest { - } @Test public void testComputeBreakpoints_uniformColumn() { - //cs.setPiecewiseTargetLoss(1e-3); + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(1e-3); double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; // ← Test-spezifisch - List breaks = computeBreakpoints(cs, column,1e-3); - assertEquals(Arrays.asList(0,5), breaks); // Erwartet: keine Breaks + List breaks = computeBreakpoints(cs, column); + assertEquals(Arrays.asList(0, 5), breaks); // Erwartet: keine Breaks } @Test public void testComputeBreakpoints_linearIncreasing() { - //cs.setPiecewiseTargetLoss(1e-3); + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(1e-3); double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; // ← andere column - List breaks = computeBreakpoints(cs, column,1e-3); + List breaks = computeBreakpoints(cs, column); assertEquals(Arrays.asList(0, 5), breaks); // Erwartet } @Test public void testComputeBreakpoints_highLoss_uniform() { - //cs.setPiecewiseTargetLoss(1.0); // ← andere Loss + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(10000.0); double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; - List breaks = computeBreakpoints(cs, column,10000.0); - assertEquals(Arrays.asList(0,5), breaks); + List breaks = computeBreakpoints(cs, column); + assertEquals(Arrays.asList(0, 5), breaks); } + @Test public void testComputeBreakpoints_twoSegments() { + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(1e-3); 
// {1,1,1, 2,2,2} → 2 Segmente → [0,3,6] double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0}; - var breaks = computeBreakpoints(cs, column, 1e-3); + var breaks = computeBreakpoints(cs, column); assertEquals(Arrays.asList(0, 3, 6), breaks); } @Test public void testComputeBreakpoints_noLoss_linear() { + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(0.0); //cs.setPiecewiseTargetLoss(0.0); double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; - List breaks = computeBreakpoints(cs, column,0.0); - assertEquals(Arrays.asList(0,5), breaks); // bei 0 Loss alle Breaks + List breaks = computeBreakpoints(cs, column); + assertEquals(Arrays.asList(0, 5), breaks); // bei 0 Loss alle Breaks } + @Test public void testComputeBreakpointsLambda_const() { - double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; // 5 Werte + double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; List breaks = computeBreakpointsLambda(column, 5.0); - assertEquals(Arrays.asList(0, 5), breaks); // 0 bis 5 + assertEquals(Arrays.asList(0, 5), breaks); breaks = computeBreakpointsLambda(column, 0.01); - assertEquals(Arrays.asList(0, 5), breaks); // auch mit kleinem lambda + assertEquals(Arrays.asList(0, 5), breaks); } + @Test public void testComputeBreakpointsLambda_twoSegments() { double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0}; // 6 Werte // mit kleinem lambda -> viele Segmente (kostenlos fast) List breaks = computeBreakpointsLambda(column, 0.01); - assertTrue(breaks.contains(3)); // 3 muss als Grenze enthalten sein - assertEquals(3, breaks.size()); // 0, 3, 6 + assertTrue(breaks.contains(3)); + assertEquals(3, breaks.size()); assertEquals(Arrays.asList(0, 3, 6), breaks); - // mit großem lambda -> nur ein Segment + // mit großem lambda entspricht nur ein Segment breaks = computeBreakpointsLambda(column, 1000.0); assertEquals(Arrays.asList(0, 6), breaks); } + @Test public void testComputeBreakpointsLambda_jumpWithTrend() { double[] column = {0.0, 1.0, 2.0, 10.0, 11.0, 12.0}; @@ -125,6 
+122,7 @@ public void testComputeBreakpointsLambda_linear() { assertTrue(breaks.get(0) == 0); assertTrue(breaks.get(breaks.size() - 1) == column.length); } + @Test public void testComputeBreakpointsLambda_edge_lambdaVerySmall() { double[] column = {1.0, 1.1, 1.0, 1.1, 1.0}; @@ -140,6 +138,7 @@ public void testComputeBreakpointsLambda_edge_lambdaVerySmall() { assertTrue(breaks.get(i) >= breaks.get(i - 1)); } } + @Test public void testComputeBreakpointsLambda_edge_lambdaVeryLarge() { double[] column = {1.0, 2.0, 1.5, 2.5, 1.8}; @@ -147,6 +146,7 @@ public void testComputeBreakpointsLambda_edge_lambdaVeryLarge() { List breaks = computeBreakpointsLambda(column, 1000.0); assertEquals(Arrays.asList(0, 5), breaks); } + @Test public void testComputeSegmentCost_emptyOrSingle() { double[] column = {10.0, 20.0, 30.0}; @@ -160,6 +160,7 @@ public void testComputeSegmentCost_emptyOrSingle() { assertEquals(0.0, computeSegmentCost(column, 1, 2), 1e-10); assertEquals(0.0, computeSegmentCost(column, 2, 3), 1e-10); } + @Test public void testComputeSegmentCost_twoConstantPoints() { double[] column = {5.0, 5.0, 1.0, 1.0}; @@ -168,6 +169,7 @@ public void testComputeSegmentCost_twoConstantPoints() { double sse = computeSegmentCost(column, 0, 2); assertEquals(0.0, sse, 1e-10); } + @Test public void testComputeSegmentCost_twoDifferentPoints() { double[] column = {0.0, 2.0, 1.0, 3.0}; @@ -180,15 +182,17 @@ public void testComputeSegmentCost_twoDifferentPoints() { sse = computeSegmentCost(column, 2, 4); assertEquals(0.0, sse, 1e-10); } + @Test public void testComputeSegmentCost_constantThree() { double[] column = {0.0, 0.0, 0.0}; double sse = computeSegmentCost(column, 0, 3); assertEquals(0.0, sse, 1e-10); } + @Test public void testComputeSegmentCost_consistent_with_regression() { - double[] column = {0.0, 2.0, 0.0, 4.0, 0.0, 6.0}; // 6 Punkte + double[] column = {0.0, 2.0, 0.0, 4.0, 0.0, 6.0}; int start = 0, end = 3; double[] ab = regressSegment(column, start, end); @@ -203,6 +207,7 @@ 
public void testComputeSegmentCost_consistent_with_regression() { double sse = computeSegmentCost(column, start, end); assertEquals(sse_hand, sse, 1e-10); } + @Test public void testComputeTotalSSE_emptyBreaks() { double[] column = {1.0, 2.0, 3.0}; @@ -212,6 +217,7 @@ public void testComputeTotalSSE_emptyBreaks() { // 0 Segmente → Summe über 0 Segmente = 0 assertEquals(0.0, total, 1e-10); } + @Test public void testComputeTotalSSE_singleSegment_all() { double[] column = {1.0, 2.0, 3.0}; @@ -223,6 +229,7 @@ public void testComputeTotalSSE_singleSegment_all() { // Ergebnis muss exakt das gleiche wie der SSE des gesamten Segments sein assertEquals(expected, total, 1e-10); } + @Test public void testComputeTotalSSE_twoSegments() { // Beispiel: [0,0,0] und [1,1,1] (jeweils konstant) @@ -237,6 +244,7 @@ public void testComputeTotalSSE_twoSegments() { assertEquals(0.0, total, 1e-10); assertEquals(sse1 + sse2, total, 1e-10); } + @Test public void testComputeTotalSSE_threeSegments() { // Ein Segment mit drei identischen Werten, zwei Segmente mit jeweils zwei Werten @@ -256,103 +264,101 @@ public void testComputeTotalSSE_threeSegments() { assertEquals(0.0, total, 1e-10); assertEquals(sse1 + sse2 + sse3, total, 1e-10); } + @Test public void testComputeTotalSSE_gapStartEnd() { double[] column = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0}; - List breaks = Arrays.asList(2, 5, 8); // Segmente [2,5), [5,8) + List breaks = Arrays.asList(2, 5, 8); double total = computeTotalSSE(column, breaks); double sse1 = computeSegmentCost(column, 2, 5); double sse2 = computeSegmentCost(column, 5, 8); - // Resultat: Summe der zwei Segmente assertEquals(sse1 + sse2, total, 1e-10); - // Die Indizes <2 und >=8 sind nicht Teil der Segmente und fließen nicht in totalSSE ein } + @Test public void testComputeTotalSSE_oneSegment_identical() { double[] column = {1.0, 2.0, 3.0, 4.0, 5.0}; - - // Vergleich: SSE des gesamten Segments über [0,5) double sseTotal = computeSegmentCost(column, 0, 5); - // 
Berechnung mit computeTotalSSE und breaks [0,5] List breaks = Arrays.asList(0, 5); double total = computeTotalSSE(column, breaks); - // beide müssen exakt gleich sein assertEquals(sseTotal, total, 1e-10); } + @Test public void testComputeTotalSSE_nonConstant() { double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; - List breaks = Arrays.asList(0, 2, 5); // [0,2), [2,5) + List breaks = Arrays.asList(0, 2, 5); double total = computeTotalSSE(column, breaks); double sse1 = computeSegmentCost(column, 0, 2); double sse2 = computeSegmentCost(column, 2, 5); - // Sanity-Check: Ergebnis positiv, Summe der beiden SSE assertTrue(total >= 0.0); assertEquals(sse1 + sse2, total, 1e-10); } + @Test public void testComputeTotalSSE_edgeCases() { - // Leere Spalte, Segmente [0,0] → kein Segment - double[] columnEmpty = {}; // length 0 + double[] columnEmpty = {}; List breaksEmpty = Arrays.asList(0, 0); assertEquals(0.0, computeTotalSSE(columnEmpty, breaksEmpty), 1e-10); - // Spalte der Länge 1, ein Segment [0,1) double[] columnOne = {42.0}; List breaksOne = Arrays.asList(0, 1); double total = computeTotalSSE(columnOne, breaksOne); assertEquals(0.0, total, 1e-10); } + @Test public void testRegressSegment_empty() { double[] column = {1.0, 2.0, 3.0}; - double[] result = regressSegment(column, 0, 0); // leer + double[] result = regressSegment(column, 0, 0); assertEquals(0.0, result[0], 1e-10); assertEquals(0.0, result[1], 1e-10); } + @Test public void testRegressSegment_singlePoint() { double[] column = {1.0, 2.0, 3.0}; - double[] result = regressSegment(column, 1, 2); // nur i=1: y=2.0 + double[] result = regressSegment(column, 1, 2); - assertEquals(0.0, result[0], 1e-10); // slope = 0 - assertEquals(2.0, result[1], 1e-10); // intercept = Mittelwert + assertEquals(0.0, result[0], 1e-10); + assertEquals(2.0, result[1], 1e-10); } + @Test public void testRegressSegment_twoIdentical() { double[] column = {5.0, 5.0, 1.0, 1.0}; - double[] result = regressSegment(column, 0, 2); // i=0:5, i=1:5 + 
double[] result = regressSegment(column, 0, 2); - // Steigung = 0, y = 5.0 + 0*i assertEquals(0.0, result[0], 1e-10); assertEquals(5.0, result[1], 1e-10); } + @Test public void testRegressSegment_twoPoints() { - double[] column = {0.0, 2.0}; // (i=0, y=0), (i=1, y=2) + double[] column = {0.0, 2.0}; double[] result = regressSegment(column, 0, 2); - // Gerade durch (0,0) und (1,2) → y = 2*i + 0 assertEquals(2.0, result[0], 1e-10); assertEquals(0.0, result[1], 1e-10); } + @Test public void testRegressSegment_twoPoints_offset() { - // column[0], column[1], column[2], column[3] → es gibt 4 Werte - double[] column = {1.0, 3.0, 5.0, 7.0}; // z. B. y = 2*x + 1 → bei x=2: y=5, x=3: y=7 - double[] result = regressSegment(column, 2, 4); // Segment [2,4) → i=2,3 - // Gerade durch (2,5), (3,7): slope = 2, intercept = 1 + double[] column = {1.0, 3.0, 5.0, 7.0}; + double[] result = regressSegment(column, 2, 4); + assertEquals(2.0, result[0], 1e-10); assertEquals(1.0, result[1], 1e-10); } + @Test public void testRegressSegment_constant() { double[] column = {3.0, 3.0, 3.0, 3.0}; @@ -361,6 +367,7 @@ public void testRegressSegment_constant() { assertEquals(0.0, result[0], 1e-10); assertEquals(3.0, result[1], 1e-10); } + @Test public void testRegressSegment_linear() { double[] column = new double[4]; @@ -371,13 +378,12 @@ public void testRegressSegment_linear() { double[] result = regressSegment(column, 0, 4); - // Exakt: slope = 1.5, intercept = 2.0 assertEquals(a, result[0], 1e-10); assertEquals(b, result[1], 1e-10); } + @Test public void testRegressSegment_denomZero() { - // fiktiv: ein Segment mit einem Punkt double[] column = {10.0}; double[] result = regressSegment(column, 0, 1); @@ -388,38 +394,283 @@ public void testRegressSegment_denomZero() { @Test public void testCompressPiecewiseLinearFunctional_const() { // 1. 
MatrixBlock mit einer konstanten Spalte erzeugen - double[] data = {1.0, 1.0, 1.0, 1.0, 1.0}; // 5 Zeilen, 1 Spalte - MatrixBlock in = new MatrixBlock(5, 1, false).quickSetMatrix(data, 5); - + int nrows = 20, ncols = 1; + MatrixBlock in = new MatrixBlock(nrows, ncols, false); + for (int r = 0; r < nrows; r++) + in.set(r, 0, 1.0); // 2. colIndexes für Spalte 0 - IColIndex colIndexes = ColIndexFactory.create(0); - - // 3. Aufruf der Kompressionsfunktion - AColGroup result = ColGroupFactory.compressPiecewiseLinearFunctional(colIndexes, in, new CompressionSettings()); + IColIndex colIndexes = ColIndexFactory.create(new int[]{0}); + // 3. CompressionSettings mit TargetLoss + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(1e-6); + // 4. Aufruf der Kompressionsfunktion + AColGroup result = ColGroupFactory.compressPiecewiseLinearFunctional(colIndexes, in, cs); - // 4. Ergebnis ist eine ColGroupPiecewiseLinearCompressed? + // 5. Ergebnis ist eine ColGroupPiecewiseLinearCompressed? assertTrue(result instanceof ColGroupPiecewiseLinearCompressed); ColGroupPiecewiseLinearCompressed plGroup = (ColGroupPiecewiseLinearCompressed) result; - // 5. Check Breakpoints: [0, 5] → ein Segment - int[] breakpoints = plGroup.c(); - assertArrayEquals(new int[] {0, 5}, breakpoints); + // 6. Breakpoints per Getter, nicht per create() + int[] breakpoints = plGroup.getBreakpoints(); + assertArrayEquals(new int[]{0, 20}, breakpoints); - // 6. Pro Segment: 1 Segment → ein slope, ein intercept + // 7. Pro Segment: 1 Segment → ein slope, ein intercept double[] slopes = plGroup.getSlopes(); double[] intercepts = plGroup.getIntercepts(); assertEquals(1, slopes.length); assertEquals(1, intercepts.length); - // 7. Für konstante Daten: Steigung ~0, intercept ~1.0 + // 8. 
Für konstante Daten: Steigung ~0, intercept ~1.0 assertEquals(0.0, slopes[0], 1e-10); - assertEquals(1.0, intercepts[0], 1e-10); // Mittelwert der Spalte + assertEquals(1.0, intercepts[0], 1e-10); + + // 9. Check: colIndexes stimmt + IColIndex idx = plGroup.getColIndices(); + assertEquals(1, idx.size()); + assertEquals(0, idx.get(0)); + } + + @Test(expected = IllegalArgumentException.class) + public void testCreate_nullBreakpoints() { + int[] nullBp = null; + ColGroupPiecewiseLinearCompressed.create( + ColIndexFactory.create(new int[]{0}), nullBp, new double[]{1.0}, new double[]{0.0}, 10); + } + + @Test(expected = IllegalArgumentException.class) + public void testCreate_tooFewBreakpoints() { + int[] singleBp = {0}; + ColGroupPiecewiseLinearCompressed.create( + ColIndexFactory.create(new int[]{0}), singleBp, new double[]{1.0}, new double[]{0.0}, 10); + } + + @Test(expected = IllegalArgumentException.class) + public void testCreate_inconsistentSlopes() { + int[] bp = {0, 5, 10}; + ColGroupPiecewiseLinearCompressed.create( + ColIndexFactory.create(new int[]{0}), bp, new double[]{1.0, 2.0, 3.0}, + new double[]{0.0, 1.0}, 10); + } + + @Test(expected = IllegalArgumentException.class) + public void testCreate_inconsistentIntercepts() { + int[] bp = {0, 5, 10}; + ColGroupPiecewiseLinearCompressed.create( + ColIndexFactory.create(new int[]{0}), bp, new double[]{1.0, 2.0}, + new double[]{0.0}, 10); + } + + @Test + public void testCreate_validMultiSegment() { + int[] bp = {0, 3, 7, 10}; + double[] slopes = {1.0, -2.0, 0.5}; + double[] intercepts = {0.0, 5.0, -1.0}; + IColIndex cols = ColIndexFactory.create(new int[]{0, 1}); + + AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, 10); + + assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); + assertNotSame(bp, ((ColGroupPiecewiseLinearCompressed) cg).getBreakpoints()); + } + + @Test + public void testCreate_multiColumn() { + IColIndex cols = ColIndexFactory.create(new int[]{5, 10, 
15}); + int[] bp = {0, 5}; + double[] slopes = {3.0}; + double[] intercepts = {2.0}; - // 8. Check: colIndexes stimmt - assertEquals(1, plGroup.getColIndex().size()); - assertEquals(0, plGroup.getColIndex().get(0)); + AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, 100); + assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); + assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); + + // + assertTrue(cg.getNumValues() > 0); + + for (int r = 0; r < 5; r++) { + double expected = 3.0 * r + 2.0; + // colIdx=0 → globale Spalte 5 + assertEquals(expected, cg.getIdx(r, 0), 1e-9); + // colIdx=1 → globale Spalte 10 + assertEquals(expected, cg.getIdx(r, 1), 1e-9); + // colIdx=2 → globale Spalte 15 + assertEquals(expected, cg.getIdx(r, 2), 1e-9); + } + + for (int r = 5; r < 10; r++) { + double expected = 3.0 * r + 2.0; + assertEquals(expected, cg.getIdx(r, 0), 1e-9); // Alle Columns gleich + } + assertEquals(cols.size(), 3); } + @Test + public void testCreate_singleColumn() { + IColIndex cols = ColIndexFactory.create(new int[]{5}); + int[] bp = {0, 5}; + double[] slopes = {3.0}; + double[] intercepts = {2.0}; + int numRows = 10; + + AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, numRows); + + assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); + + assertEquals(2.0, cg.getIdx(0, 0), 1e-9); // 3*0 + 2 + assertEquals(5.0, cg.getIdx(1, 0), 1e-9); // 3*1 + 2 + } + + @Test + public void testCreate_validMinimal() { + + // 1 Segment: [0,10] → y = 2.0 * r + 1.0 + int[] bp = {0, 10}; + double[] slopes = {2.0}; + double[] intercepts = {1.0}; + IColIndex cols = ColIndexFactory.create(new int[]{0}); + int numRows = 10; + + AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, numRows); + + // Korrekte Instanz + assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); + + // getNumValues() > 0 + assertTrue(cg.getNumValues() > 0); + + // r < numRows + for 
(int r = 0; r < numRows; r++) { + double expected = 2.0 * r + 1.0; + assertEquals("Row " + r, expected, cg.getIdx(r, 0), 1e-9); + } + + // Letzte gültige Row + assertEquals(19.0, cg.getIdx(9, 0), 1e-9); + + //Out-of-Bounds korrekt 0.0 + assertEquals(0.0, cg.getIdx(10, 0), 1e-9); + assertEquals(0.0, cg.getIdx(9, 1), 1e-9); + } + + @Test + public void testDecompressToDenseBlock() { + int[] bp = {0, 5, 10}; + double[] slopes = {1.0, 2.0}; + double[] intercepts = {0.0, 1.0}; + int numRows = 10; + + AColGroup cg = ColGroupPiecewiseLinearCompressed.create( + ColIndexFactory.create(new int[]{0}), bp, slopes, intercepts, numRows); + + // 1. MatrixBlock mit korrekten Dimensionen + MatrixBlock target = new MatrixBlock(numRows, 1, false); + + // 2. DenseBlock ZUERST alloziieren! + target.allocateDenseBlock(); // Oder target.allocateDenseBlock(true); + + // 3. Jetzt DenseBlock verfügbar + DenseBlock db = target.getDenseBlock(); + assertNotNull(db); // Sicherstellen! + + // 4. Dekomprimieren + cg.decompressToDenseBlock(db, 0, numRows, 0, 0); + + // 5. Prüfen + for (int r = 0; r < numRows; r++) { + double expected = (r < 5) ? 
1.0 * r : 2.0 * r + 1.0; + assertEquals("Row " + r, expected, db.get(r, 0), 1e-9); + } + } + + private ColGroupPiecewiseLinearCompressed createTestGroup(int numRows) { + int[] bp = {0, 5, numRows}; + double[] slopes = {1.0, 3.0}; + double[] intercepts = {0.0, 2.0}; + return (ColGroupPiecewiseLinearCompressed) ColGroupPiecewiseLinearCompressed.create( + ColIndexFactory.create(new int[]{0}), bp, slopes, intercepts, numRows); + } + + @Test + public void testDecompressToDenseBlock_fullRange() { + ColGroupPiecewiseLinearCompressed cg = createTestGroup(12); + + MatrixBlock target = new MatrixBlock(12, 1, false); + target.allocateDenseBlock(); + DenseBlock db = target.getDenseBlock(); + + cg.decompressToDenseBlock(db, 0, 12, 0, 0); + + // Segment 0 [0,5): y = r + assertEquals(0.0, db.get(0, 0), 1e-9); + assertEquals(4.0, db.get(4, 0), 1e-9); + + assertEquals(17.0, db.get(5, 0), 1e-9); + assertEquals(29.0, db.get(9, 0), 1e-9); + assertEquals(32.0, db.get(10, 0), 1e-9); + assertEquals(35.0, db.get(11, 0), 1e-9); + } + + + + @Test + public void testDecompressToDenseBlock_partialRange() { + ColGroupPiecewiseLinearCompressed cg = createTestGroup(12); + + MatrixBlock target = new MatrixBlock(12, 1, false); + target.allocateDenseBlock(); + DenseBlock db = target.getDenseBlock(); + + // rl=6, ru=9 → r=6,7,8 dekomprimieren + // offR=0 → schreibt in Target-Rows 6,7,8 + cg.decompressToDenseBlock(db, 6, 9, 0, 0); + + + assertEquals(0.0, db.get(0, 0), 1e-9); // Unberührt (vor rl=6) + assertEquals(20.0, db.get(6, 0), 1e-9); + assertEquals(23.0, db.get(7, 0), 1e-9); + assertEquals(26.0, db.get(8, 0), 1e-9); + assertEquals(0.0, db.get(9, 0), 1e-9); // Unberührt (nach ru=9) + } + + + @Test + public void testDecompressToDenseBlock_emptyRange() { + ColGroupPiecewiseLinearCompressed cg = createTestGroup(12); + + MatrixBlock target = new MatrixBlock(5, 1, false); + target.allocateDenseBlock(); + DenseBlock db = target.getDenseBlock(); + + // Leerer Bereich + cg.decompressToDenseBlock(db, 12, 
12, 0, 0); // rl=ru + cg.decompressToDenseBlock(db, 3, 2, 0, 0); // rl>ru + + // Alles bleibt 0.0 + for (int r = 0; r < 5; r++) { + assertEquals(0.0, db.get(r, 0), 1e-9); + } + } + + @Test + public void testDecompressToDenseBlock_nullSafety() { + ColGroupPiecewiseLinearCompressed cg = createTestGroup(10); + + // Null DenseBlock + cg.decompressToDenseBlock(null, 0, 10, 0, 0); + + // Ungültige Parameter (leerer Bereich) + MatrixBlock target = new MatrixBlock(10, 1, false); + target.allocateDenseBlock(); + DenseBlock db = target.getDenseBlock(); + + cg.decompressToDenseBlock(db, 12, 12, 0, 0); // rl == ru + cg.decompressToDenseBlock(db, 5, 2, 0, 0); // rl > ru + + // Target unverändert + for (int r = 0; r < 10; r++) { + assertEquals(0.0, db.get(r, 0), 1e-9); + } + } From 78460b51a08574557089866bc80c51488950c8da Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Wed, 28 Jan 2026 01:52:21 +0100 Subject: [PATCH 07/35] wip: decompressing --- .../colgroup/scheme/ColGroupPiecewiseLinearCompressed.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java index 71e935643d9..4062c4da611 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java @@ -65,7 +65,7 @@ public static AColGroup create(IColIndex colIndexes, int[] breakpoints, double[] @Override public void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int offC) { - // ✅ Vollständige Null-Safety + if (db == null || _colIndexes == null || _colIndexes.size() == 0 || breakpoints == null || slopes == null || intercepts == null) { return; @@ -95,7 +95,7 @@ public void 
decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int int gr = offR + r; int gc = offC + col; - // ✅ Bounds-Check vor set() + if (gr >= 0 && gr < db.numRows() && gc >= 0 && gc < db.numCols()) { db.set(gr, gc, yhat); } From f42b766d693e95ecff41faaff16f1192deedd738 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Wed, 28 Jan 2026 13:14:57 +0100 Subject: [PATCH 08/35] add: Enum Compressiontype piecewiselinear --- .../org/apache/sysds/runtime/compress/colgroup/AColGroup.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java index 003703f86a4..e2bf69f5c15 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java @@ -65,7 +65,7 @@ public abstract class AColGroup implements Serializable { /** Public super types of compression ColGroups supported */ public static enum CompressionType { - UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCFOR, DDCFOR, DeltaDDC, LinearFunctional; + UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCFOR, DDCFOR, DeltaDDC, LinearFunctional, PiecewiseLinear; public boolean isDense() { return this == DDC || this == CONST || this == DDCFOR || this == DDCFOR; From 47256c0a14ce2d2b8a0fb0c068bdc82ea039b4a5 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Wed, 28 Jan 2026 13:16:10 +0100 Subject: [PATCH 09/35] add: include functionality of piecewise linear compression --- .../sysds/runtime/compress/colgroup/ColGroupFactory.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java index 06bf74b423c..c5de46b161c 100644 --- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java @@ -304,6 +304,11 @@ else if(ct == CompressionType.LinearFunctional) { return compressLinearFunctional(colIndexes, in, cs); } } + else if(ct == CompressionType.PiecewiseLinear) { + + return compressPiecewiseLinearFunctional(colIndexes, in, cs); + + } else if(ct == CompressionType.DDCFOR) { AColGroup g = directCompressDDC(colIndexes, cg); if(g instanceof ColGroupDDC) @@ -1080,7 +1085,6 @@ public static AColGroup compressPiecewiseLinearFunctional(IColIndex colIndexes, // Breakpoints bestimmen: Einteilung der Segmente - double targetLoss = cs.getPiecewiseTargetLoss(); List breakpointsList = computeBreakpoints(cs, column); int[] breakpoints = breakpointsList.stream().mapToInt(Integer::intValue).toArray(); //Für jedes Segment lineare Regression als kompressionsverfahren From 505c0ccfeaf011650363382a45d69f889fd4fa9b Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Wed, 28 Jan 2026 13:47:16 +0100 Subject: [PATCH 10/35] add: Comment --- .../org/apache/sysds/runtime/compress/colgroup/AColGroup.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java index e2bf69f5c15..32a4053c95b 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java @@ -64,7 +64,8 @@ public abstract class AColGroup implements Serializable { private static final long serialVersionUID = -1318908671481L; /** Public super types of compression ColGroups supported */ - public static enum CompressionType { + // Enum hinzugefügt -> Brauche ich aber auch das im ColGroupType Enum ergänzen? 
+ public static enum CompressionType { UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCFOR, DDCFOR, DeltaDDC, LinearFunctional, PiecewiseLinear; public boolean isDense() { From 103abd87c1ef55eedc4f6980da7285539ec87de1 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Wed, 28 Jan 2026 13:49:15 +0100 Subject: [PATCH 11/35] add: dispatch test and remove unused imports --- ...ColGroupPiecewiseLinearCompressedTest.java | 41 ++++++++++++++++--- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java index c0ca62ce9d5..5b3688be5b1 100644 --- a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java +++ b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java @@ -2,20 +2,17 @@ import org.apache.sysds.runtime.compress.CompressionSettings; import org.apache.sysds.runtime.compress.CompressionSettingsBuilder; -import org.apache.sysds.runtime.compress.colgroup.indexes.ArrayIndex; import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory; import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex; import org.apache.sysds.runtime.compress.colgroup.scheme.ColGroupPiecewiseLinearCompressed; -import org.apache.sysds.runtime.compress.colgroup.ColGroupFactory; - +import org.apache.sysds.runtime.compress.estim.CompressedSizeInfo; +import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup; +import org.apache.sysds.runtime.compress.estim.EstimationFactors; import org.apache.sysds.runtime.data.DenseBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.junit.Test; -import org.junit.jupiter.api.BeforeEach; - import java.util.Arrays; import java.util.List; - import static 
org.apache.sysds.runtime.compress.colgroup.ColGroupFactory.*; import static org.junit.Assert.*; @@ -671,7 +668,39 @@ public void testDecompressToDenseBlock_nullSafety() { assertEquals(0.0, db.get(r, 0), 1e-9); } } + private CompressedSizeInfo createTestCompressedSizeInfo() { + IColIndex cols = ColIndexFactory.create(new int[]{0}); + EstimationFactors facts = new EstimationFactors(2, 10); + + CompressedSizeInfoColGroup info = new CompressedSizeInfoColGroup( + cols, facts, AColGroup.CompressionType.PiecewiseLinear); + + List infos = Arrays.asList(info); + CompressedSizeInfo csi = new CompressedSizeInfo(infos); + + return csi; + } + @Test + public void testCompressPiecewiseLinear_viaRealAPI() { + + MatrixBlock in = new MatrixBlock(10, 1, false); + in.allocateDenseBlock(); + for (int r = 0; r < 10; r++) { + in.set(r, 0, r * 0.5); + } + CompressionSettings cs = new CompressionSettingsBuilder() + .addValidCompression(AColGroup.CompressionType.PiecewiseLinear) + .create(); + + CompressedSizeInfo csi = createTestCompressedSizeInfo(); + + List colGroups = ColGroupFactory.compressColGroups(in, csi, cs); + + boolean hasPiecewise = colGroups.stream() + .anyMatch(cg -> cg instanceof ColGroupPiecewiseLinearCompressed); + assertTrue(hasPiecewise); + } } \ No newline at end of file From 31b957de3ea941e437b51527d8afaa7830e35a8c Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Thu, 29 Jan 2026 19:50:40 +0100 Subject: [PATCH 12/35] fix: reformat code mit Eclipse XML Profile --- .../runtime/compress/CompressionSettings.java | 333 ++-- .../runtime/compress/colgroup/AColGroup.java | 235 +-- .../compress/colgroup/ColGroupFactory.java | 374 +++-- .../ColGroupPiecewiseLinearCompressed.java | 623 ++++---- ...ColGroupPiecewiseLinearCompressedTest.java | 1335 ++++++++--------- 5 files changed, 1440 insertions(+), 1460 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java 
b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java index d1f97928975..7d5a1dac51a 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java @@ -34,174 +34,171 @@ * CompressionSettingsBuilder for default non static parameters. */ public class CompressionSettings { - private static final Log LOG = LogFactory.getLog(CompressionSettings.class.getName()); + private static final Log LOG = LogFactory.getLog(CompressionSettings.class.getName()); - /** Parallelization threshold for DDC compression */ - public static int PAR_DDC_THRESHOLD = 10000; + /** Parallelization threshold for DDC compression */ + public static int PAR_DDC_THRESHOLD = 10000; + + /** + * Size of the blocks used in a blocked bitmap representation. Note it is exactly Character.MAX_VALUE. This is not + * Character max value + 1 because it breaks the offsets in cases with fully dense values. + */ + public static final int BITMAP_BLOCK_SZ = Character.MAX_VALUE; - /** - * Size of the blocks used in a blocked bitmap representation. Note it is exactly Character.MAX_VALUE. This is not - * Character max value + 1 because it breaks the offsets in cases with fully dense values. 
- */ - public static final int BITMAP_BLOCK_SZ = Character.MAX_VALUE; + /** + * Sorting of values by physical length helps by 10-20%, especially for serial, while slight performance decrease + * for parallel incl multi-threaded, hence not applied for distributed operations (also because compression time + + * garbage collection increases) + */ + public final boolean sortTuplesByFrequency; - /** - * Sorting of values by physical length helps by 10-20%, especially for serial, while slight performance decrease - * for parallel incl multi-threaded, hence not applied for distributed operations (also because compression time + - * garbage collection increases) - */ - public final boolean sortTuplesByFrequency; - - /** - * The sampling ratio used when choosing ColGroups. Note that, default behavior is to use exact estimator if the - * number of elements is below 1000. - * - * DEPRECATED - */ - public final double samplingRatio; - - /** - * The sampling ratio power to use when choosing sample size. This is used in accordance to the function: - * - * sampleSize += nRows^samplePower; - * - * The value is bounded to be in the range of 0 to 1, 1 giving a sample size of everything, and 0 adding 1. - */ - public final double samplePower; - - /** Share DDC Dictionaries between ColGroups. */ - public final boolean allowSharedDictionary; - - /** Boolean specifying which transpose setting is used, can be auto, true or false */ - public final String transposeInput; - - /** If the seed is -1 then the system used system millisecond time and class hash for seeding. 
*/ - public final int seed; - - /** True if lossy compression is enabled */ - public final boolean lossy; - - /** The selected method for column partitioning used in CoCoding compressed columns */ - public final PartitionerType columnPartitioner; - - /** The cost computation type for the compression */ - public final CostType costComputationType; - - /** The maximum number of columns CoCoded allowed */ - public final int maxColGroupCoCode; - - /** - * A Cocode parameter that differ in behavior based on compression method, in general it is a value that reflects - * aggressively likely coCoding is used. - */ - public final double coCodePercentage; - - /** - * Valid Compressions List, containing the ColGroup CompressionTypes that are allowed to be used for the compression - * Default is to always allow for Uncompromisable ColGroup. - */ - public final EnumSet validCompressions; - - /** The minimum size of the sample extracted. */ - public final int minimumSampleSize; - - /** The maximum size of the sample extracted. */ - public final int maxSampleSize; - - /** The sample type used for sampling */ - public final EstimationType estimationType; - - /** - * Transpose input matrix, to optimize access when extracting bitmaps. This setting is changed inside the script - * based on the transposeInput setting. - * - * This is intentionally left as a mutable value, since the transposition of the input matrix is decided in phase 3. - */ - public boolean transposed = false; - - /** The minimum compression ratio to achieve. */ - public final double minimumCompressionRatio; - - - - /** Is a spark instruction */ - public final boolean isInSparkInstruction; - - /** The sorting type used in sorting/joining offsets to create SDC groups */ - public final SORT_TYPE sdcSortType; - - /** if the settings have been logged already. 
*/ - public static boolean printedStatus = false; - - public final double[] scaleFactors; - - public final boolean preferDeltaEncoding; - - /** - * Ziel-Gesantverlust für piecewise Lineace Komocession• - * Interpretation: maximal entaubter Alobaler MSE pro Went in der Sealte. - * O.O ~ quasi verlustfrei, viele Segmente - * >0 - ~ mehr Approximation entaubt, weniger Segmente - */ - private double piecewiseTargetLoss = Double.NaN; - public void setPiecewiseTargetLoss(double piecewiseTargetLoss) { - this.piecewiseTargetLoss = piecewiseTargetLoss; - - } - public double getPiecewiseTargetLoss() { - return piecewiseTargetLoss; - } - - - protected CompressionSettings(double samplingRatio, double samplePower, boolean allowSharedDictionary, - String transposeInput, int seed, boolean lossy, EnumSet validCompressions, - boolean sortValuesByLength, PartitionerType columnPartitioner, int maxColGroupCoCode, double coCodePercentage, - int minimumSampleSize, int maxSampleSize, EstimationType estimationType, CostType costComputationType, - double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType, double[] scaleFactors, - boolean preferDeltaEncoding) { - this.samplingRatio = samplingRatio; - this.samplePower = samplePower; - this.allowSharedDictionary = allowSharedDictionary; - this.transposeInput = transposeInput; - this.seed = seed == -1 ? 
(int) System.nanoTime() : seed; - this.validCompressions = validCompressions; - this.lossy = lossy; - this.sortTuplesByFrequency = sortValuesByLength; - this.columnPartitioner = columnPartitioner; - this.maxColGroupCoCode = maxColGroupCoCode; - this.coCodePercentage = coCodePercentage; - this.minimumSampleSize = minimumSampleSize; - this.maxSampleSize = maxSampleSize; - this.estimationType = estimationType; - this.costComputationType = costComputationType; - this.minimumCompressionRatio = minimumCompressionRatio; - this.isInSparkInstruction = isInSparkInstruction; - this.sdcSortType = sdcSortType; - this.scaleFactors = scaleFactors; - this.preferDeltaEncoding = preferDeltaEncoding; - - if(!printedStatus && LOG.isDebugEnabled()) { - printedStatus = true; - LOG.debug(this.toString()); - } - } - - public boolean isRLEAllowed() { - return this.validCompressions.contains(CompressionType.RLE); - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("CompressionSettings: "); - sb.append("\t Valid Compressions: " + validCompressions); - sb.append("\t Share dict: " + allowSharedDictionary); - sb.append("\t Partitioner: " + columnPartitioner); - sb.append("\t Lossy: " + lossy); - sb.append("\t Cost Computation Type: " + costComputationType); - if(samplingRatio < 1.0) - sb.append("\t Estimation Type: " + estimationType); - return sb.toString(); - } -} \ No newline at end of file + /** + * The sampling ratio used when choosing ColGroups. Note that, default behavior is to use exact estimator if the + * number of elements is below 1000. + * + * DEPRECATED + */ + public final double samplingRatio; + + /** + * The sampling ratio power to use when choosing sample size. This is used in accordance to the function: + * + * sampleSize += nRows^samplePower; + * + * The value is bounded to be in the range of 0 to 1, 1 giving a sample size of everything, and 0 adding 1. 
+ */ + public final double samplePower; + + /** Share DDC Dictionaries between ColGroups. */ + public final boolean allowSharedDictionary; + + /** Boolean specifying which transpose setting is used, can be auto, true or false */ + public final String transposeInput; + + /** If the seed is -1 then the system used system millisecond time and class hash for seeding. */ + public final int seed; + + /** True if lossy compression is enabled */ + public final boolean lossy; + + /** The selected method for column partitioning used in CoCoding compressed columns */ + public final PartitionerType columnPartitioner; + + /** The cost computation type for the compression */ + public final CostType costComputationType; + + /** The maximum number of columns CoCoded allowed */ + public final int maxColGroupCoCode; + + /** + * A Cocode parameter that differ in behavior based on compression method, in general it is a value that reflects + * aggressively likely coCoding is used. + */ + public final double coCodePercentage; + + /** + * Valid Compressions List, containing the ColGroup CompressionTypes that are allowed to be used for the compression + * Default is to always allow for Uncompromisable ColGroup. + */ + public final EnumSet validCompressions; + + /** The minimum size of the sample extracted. */ + public final int minimumSampleSize; + + /** The maximum size of the sample extracted. */ + public final int maxSampleSize; + + /** The sample type used for sampling */ + public final EstimationType estimationType; + + /** + * Transpose input matrix, to optimize access when extracting bitmaps. This setting is changed inside the script + * based on the transposeInput setting. + * + * This is intentionally left as a mutable value, since the transposition of the input matrix is decided in phase + * 3. + */ + public boolean transposed = false; + + /** The minimum compression ratio to achieve. 
*/ + public final double minimumCompressionRatio; + + /** Is a spark instruction */ + public final boolean isInSparkInstruction; + + /** The sorting type used in sorting/joining offsets to create SDC groups */ + public final SORT_TYPE sdcSortType; + + /** if the settings have been logged already. */ + public static boolean printedStatus = false; + + public final double[] scaleFactors; + + public final boolean preferDeltaEncoding; + + /** + * Target total loss for piecewise linear compression. Interpretation: maximum allowed global MSE per value in + * the column. 0.0 ~ virtually lossless, many segments; >0 ~ more approximation allowed, fewer segments. + */ + private double piecewiseTargetLoss = Double.NaN; + + public void setPiecewiseTargetLoss(double piecewiseTargetLoss) { + this.piecewiseTargetLoss = piecewiseTargetLoss; + + } + + public double getPiecewiseTargetLoss() { + return piecewiseTargetLoss; + } + + protected CompressionSettings(double samplingRatio, double samplePower, boolean allowSharedDictionary, + String transposeInput, int seed, boolean lossy, EnumSet validCompressions, + boolean sortValuesByLength, PartitionerType columnPartitioner, int maxColGroupCoCode, double coCodePercentage, + int minimumSampleSize, int maxSampleSize, EstimationType estimationType, CostType costComputationType, + double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType, double[] scaleFactors, + boolean preferDeltaEncoding) { + this.samplingRatio = samplingRatio; + this.samplePower = samplePower; + this.allowSharedDictionary = allowSharedDictionary; + this.transposeInput = transposeInput; + this.seed = seed == -1 ? 
(int) System.nanoTime() : seed; + this.validCompressions = validCompressions; + this.lossy = lossy; + this.sortTuplesByFrequency = sortValuesByLength; + this.columnPartitioner = columnPartitioner; + this.maxColGroupCoCode = maxColGroupCoCode; + this.coCodePercentage = coCodePercentage; + this.minimumSampleSize = minimumSampleSize; + this.maxSampleSize = maxSampleSize; + this.estimationType = estimationType; + this.costComputationType = costComputationType; + this.minimumCompressionRatio = minimumCompressionRatio; + this.isInSparkInstruction = isInSparkInstruction; + this.sdcSortType = sdcSortType; + this.scaleFactors = scaleFactors; + this.preferDeltaEncoding = preferDeltaEncoding; + + if(!printedStatus && LOG.isDebugEnabled()) { + printedStatus = true; + LOG.debug(this.toString()); + } + } + + public boolean isRLEAllowed() { + return this.validCompressions.contains(CompressionType.RLE); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("CompressionSettings: "); + sb.append("\t Valid Compressions: " + validCompressions); + sb.append("\t Share dict: " + allowSharedDictionary); + sb.append("\t Partitioner: " + columnPartitioner); + sb.append("\t Lossy: " + lossy); + sb.append("\t Cost Computation Type: " + costComputationType); + if(samplingRatio < 1.0) + sb.append("\t Estimation Type: " + estimationType); + return sb.toString(); + } +} diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java index 32a4053c95b..d761af7667a 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java @@ -55,7 +55,7 @@ /** * Abstract Class that is the lowest class type for the Compression framework. - * + * * AColGroup store information about a number of columns. 
* */ @@ -64,8 +64,8 @@ public abstract class AColGroup implements Serializable { private static final long serialVersionUID = -1318908671481L; /** Public super types of compression ColGroups supported */ - // Enum hinzugefügt -> Brauche ich aber auch das im ColGroupType Enum ergänzen? - public static enum CompressionType { + // Enum hinzugefügt -> Brauche ich aber auch das im ColGroupType Enum ergänzen? + public static enum CompressionType { UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCFOR, DDCFOR, DeltaDDC, LinearFunctional, PiecewiseLinear; public boolean isDense() { @@ -83,7 +83,7 @@ public boolean isSDC() { /** * Concrete ColGroupType - * + * * Protected such that outside the ColGroup package it should be unknown which specific subtype is used. */ protected static enum ColGroupType { @@ -96,7 +96,7 @@ protected static enum ColGroupType { /** * Main constructor. - * + * * @param colIndices offsets of the columns in the matrix block that make up the group */ protected AColGroup(IColIndex colIndices) { @@ -105,7 +105,7 @@ protected AColGroup(IColIndex colIndices) { /** * Obtain the offsets of the columns in the matrix block that make up the group - * + * * @return offsets of the columns in the matrix block that make up the group */ public final IColIndex getColIndices() { @@ -114,7 +114,7 @@ public final IColIndex getColIndices() { /** * Obtain the number of columns in this column group. - * + * * @return number of columns in this column group */ public final int getNumCols() { @@ -125,9 +125,9 @@ public final int getNumCols() { * Shift all column indexes contained by an offset. * * This is used for rbind to combine compressed matrices. 
- * + * * Since column indexes are reused between operations, we allocate a new list here to be safe - * + * * @param offset The offset to move all columns * @return A new column group object with the shifted columns */ @@ -139,7 +139,7 @@ public final AColGroup shiftColIndices(int offset) { * Copy the content of the column group with pointers to the previous content but with new column given Note this * method does not verify if the colIndexes specified are valid and correct dimensions for the underlying column * groups. - * + * * @param colIndexes the new indexes to use in the copy * @return a new object with pointers to underlying data. */ @@ -147,7 +147,7 @@ public final AColGroup shiftColIndices(int offset) { /** * Get the upper bound estimate of in memory allocation for the column group. - * + * * @return an upper bound on the number of bytes used to store this ColGroup in memory. */ public long estimateInMemorySize() { @@ -158,9 +158,9 @@ public long estimateInMemorySize() { /** * Decompress a range of rows into a sparse block - * + * * Note that this is using append, so the sparse column indexes need to be sorted afterwards. - * + * * @param sb Sparse Target block * @param rl Row to start at * @param ru Row to end at @@ -171,7 +171,7 @@ public final void decompressToSparseBlock(SparseBlock sb, int rl, int ru) { /** * Decompress a range of rows into a dense block - * + * * @param db Dense target block * @param rl Row to start at * @param ru Row to end at @@ -182,7 +182,7 @@ public final void decompressToDenseBlock(DenseBlock db, int rl, int ru) { /** * Decompress a range of rows into a dense transposed block. - * + * * @param db Dense target block * @param rl Row in this column group to start at. * @param ru Row in this column group to end at. @@ -192,7 +192,7 @@ public final void decompressToDenseBlock(DenseBlock db, int rl, int ru) { /** * Decompress the column group to the sparse transposed block. 
Note that the column groups would only need to * decompress into specific sub rows of the Sparse block - * + * * @param sb Sparse target block * @param nColOut The number of columns in the sb. */ @@ -200,7 +200,7 @@ public final void decompressToDenseBlock(DenseBlock db, int rl, int ru) { /** * Serializes column group to data output. - * + * * @param out data output * @throws IOException if IOException occurs */ @@ -213,7 +213,7 @@ protected void write(DataOutput out) throws IOException { /** * Returns the exact serialized size of column group. This can be used for example for buffer preallocation. - * + * * @return exact serialized size for column group */ public long getExactSizeOnDisk() { @@ -226,11 +226,11 @@ public long getExactSizeOnDisk() { /** * Slice out the columns within the range of cl and cu to remove the dictionary values related to these columns. If * the ColGroup slicing from does not contain any columns within the range null is returned. - * + * * @param cl The lower bound of the columns to select * @param cu The upper bound of the columns to select (not inclusive). * @return A cloned Column Group, with a copied pointer to the old column groups index structure, but reduced - * dictionary and _columnIndexes correctly aligned with the expected sliced compressed matrix. + * dictionary and _columnIndexes correctly aligned with the expected sliced compressed matrix. */ public final AColGroup sliceColumns(int cl, int cu) { if(cl <= _colIndexes.get(0) && cu > _colIndexes.get(_colIndexes.size() - 1)) { @@ -248,10 +248,10 @@ else if(cu - cl == 1) /** * Slice out a single column from the column group. - * + * * @param col The column to slice, the column could potentially not be inside the column group * @return A new column group that is a single column, if the column requested is not in this column group null is - * returned. + * returned. 
*/ public final AColGroup sliceColumn(int col) { int idx = _colIndexes.findIndex(col); @@ -263,11 +263,11 @@ public final AColGroup sliceColumn(int col) { /** * Slice out multiple columns within the interval between the given indexes. - * + * * @param cl The lower column index to slice from * @param cu The upper column index to slice to, (not included) * @return A column group of this containing the columns specified, returns null if the columns specified is not - * contained in the column group + * contained in the column group */ protected final AColGroup sliceMultiColumns(int cl, int cu) { SliceResult sr = _colIndexes.slice(cl, cu); @@ -279,7 +279,7 @@ protected final AColGroup sliceMultiColumns(int cl, int cu) { /** * Compute the column sum of the given list of groups - * + * * @param groups The Groups to sum * @param res The result to put the values into * @param nRows The number of rows in the groups @@ -293,9 +293,9 @@ public static double[] colSum(Collection groups, double[] res, int nR /** * Get the value at a global row/column position. - * + * * In general this performs since a binary search of colIndexes is performed for each lookup. - * + * * @param r row * @param c column * @return value at the row/column position @@ -310,7 +310,7 @@ public double get(int r, int c) { /** * Get the value at a colGroup specific row/column index position. - * + * * @param r row * @param colIdx column index in the _colIndexes. * @return value at the row/column index position @@ -319,16 +319,16 @@ public double get(int r, int c) { /** * Obtain number of distinct tuples in contained sets of values associated with this column group. - * + * * If the column group is uncompressed the number or rows is returned. - * + * * @return the number of distinct sets of values associated with the bitmaps in this column group */ public abstract int getNumValues(); /** * Obtain the compression type. - * + * * @return How the elements of the column group are compressed. 
*/ public abstract CompressionType getCompType(); @@ -336,14 +336,14 @@ public double get(int r, int c) { /** * Internally get the specific type of ColGroup, this could be extracted from the object but that does not allow for * nice switches in the code. - * + * * @return ColGroupType of the object. */ protected abstract ColGroupType getColGroupType(); /** * Decompress into the DenseBlock. (no NNZ handling) - * + * * @param db Target DenseBlock * @param rl Row to start decompression from * @param ru Row to end decompression at (not inclusive) @@ -354,10 +354,10 @@ public double get(int r, int c) { /** * Decompress into the SparseBlock. (no NNZ handling) - * + * * Note this method is allowing to calls to append since it is assumed that the sparse column indexes are sorted * afterwards - * + * * @param sb Target SparseBlock * @param rl Row to start decompression from * @param ru Row to end decompression at (not inclusive) @@ -368,9 +368,9 @@ public double get(int r, int c) { /** * Right matrix multiplication with this column group. - * + * * This method can return null, meaning that the output overlapping group would have been empty. - * + * * @param right The MatrixBlock on the right of this matrix multiplication * @return The new Column Group or null that is the result of the matrix multiplication. */ @@ -380,9 +380,9 @@ public final AColGroup rightMultByMatrix(MatrixBlock right) { /** * Right matrix multiplication with this column group. - * + * * This method can return null, meaning that the output overlapping group would have been empty. - * + * * @param right The MatrixBlock on the right of this matrix multiplication * @param allCols A pre-materialized list of all col indexes, that can be shared across all column groups if use * full, can be set to null. 
@@ -393,7 +393,7 @@ public final AColGroup rightMultByMatrix(MatrixBlock right) { /** * Right side Matrix multiplication, iterating though this column group and adding to the ret - * + * * @param right Right side matrix to multiply with. * @param ret The return matrix to add results to * @param rl The row of this column group to multiply from @@ -402,18 +402,20 @@ public final AColGroup rightMultByMatrix(MatrixBlock right) { * @param cru The right hand side column upper * @param nRows The number of rows in this column group */ - public void rightDecompressingMult(MatrixBlock right, MatrixBlock ret, int rl, int ru, int nRows, int crl, int cru){ - throw new NotImplementedException("not supporting right Decompressing Multiply on class: " + this.getClass().getSimpleName()); + public void rightDecompressingMult(MatrixBlock right, MatrixBlock ret, int rl, int ru, int nRows, int crl, + int cru) { + throw new NotImplementedException( + "not supporting right Decompressing Multiply on class: " + this.getClass().getSimpleName()); } /** * Do a transposed self matrix multiplication on the left side t(x) %*% x. but only with this column group. - * + * * This gives better performance since there is no need to iterate through all the rows of the matrix, but the * execution can be limited to its number of distinct values. - * + * * Note it only calculate the upper triangle - * + * * @param ret The return matrix block [numColumns x numColumns] * @param nRows The number of rows in the column group */ @@ -421,7 +423,7 @@ public void rightDecompressingMult(MatrixBlock right, MatrixBlock ret, int rl, i /** * Left multiply with this column group. 
- * + * * @param matrix The matrix to multiply with on the left * @param result The result to output the values into, always dense for the purpose of the column groups * parallelizing @@ -435,7 +437,7 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Left side matrix multiplication with a column group that is transposed. - * + * * @param lhs The left hand side Column group to multiply with, the left hand side should be considered * transposed. Also it should be guaranteed that this column group is not empty. * @param result The result matrix to insert the result of the multiplication into @@ -445,16 +447,16 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Matrix multiply with this other column group, but: - * + * * 1. Only output upper triangle values. - * + * * 2. Multiply both ways with "this" being on the left and on the right. - * + * * It should be guaranteed that the input is not the same as the caller of the method. - * + * * The second step is achievable by treating the initial multiplied matrix, and adding its values to the correct * locations in the output. - * + * * @param other The other Column group to multiply with * @param result The result matrix to put the results into */ @@ -463,7 +465,7 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Perform the specified scalar operation directly on the compressed column group, without decompressing individual * cells if possible. - * + * * @param op operation to perform * @return version of this column group with the operation applied */ @@ -471,7 +473,7 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Perform a binary row operation. 
- * + * * @param op The operation to execute * @param v The vector of values to apply the values contained should be at least the length of the highest * value in the column index @@ -482,7 +484,7 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Short hand add operator call on column group to add a row vector to the column group - * + * * @param v The vector to add * @return A new column group where the vector is added. */ @@ -492,7 +494,7 @@ public AColGroup addVector(double[] v) { /** * Perform a binary row operation. - * + * * @param op The operation to execute * @param v The vector of values to apply the values contained should be at least the length of the highest * value in the column index @@ -504,9 +506,9 @@ public AColGroup addVector(double[] v) { /** * Unary Aggregate operator, since aggregate operators require new object output, the output becomes an uncompressed * matrix. - * + * * The range of rl to ru only applies to row aggregates. (ReduceCol) - * + * * @param op The operator used * @param c The output matrix block * @param nRows The total number of rows in the Column Group @@ -517,9 +519,9 @@ public AColGroup addVector(double[] v) { /** * Slice out column at specific index of this column group. - * + * * It is guaranteed that the column to slice is contained in this columnGroup. - * + * * @param idx The column index to slice out. * @return A new column group containing the columns inside. (never null) */ @@ -527,9 +529,9 @@ public AColGroup addVector(double[] v) { /** * Slice range of columns inside this column group. - * + * * It is guaranteed that the columns to slice is contained in this columnGroup. 
- * + * * @param idStart The column index to start at * @param idEnd The column index to end at (not included) * @param outputCols The output columns to extract materialized for ease of implementation @@ -539,9 +541,10 @@ public AColGroup addVector(double[] v) { /** * Slice range of rows out of the column group and return a new column group only containing the row segment. - * - * Note that this slice should maintain pointers back to the original dictionaries and only modify index structures. - * + * + * Note that this slice should maintain pointers back to the original dictionaries and only modify index + * structures. + * * @param rl The row to start at * @param ru The row to end at (not included) * @return A new column group containing the specified row range. @@ -550,21 +553,21 @@ public AColGroup addVector(double[] v) { /** * Short hand method for getting minimum value contained in this column group. - * + * * @return The minimum value contained in this ColumnGroup */ public abstract double getMin(); /** * Short hand method for getting maximum value contained in this column group. - * + * * @return The maximum value contained in this ColumnGroup */ public abstract double getMax(); /** * Short hand method for getting the sum of this column group - * + * * @param nRows The number of rows in the column group * @return The sum of this column group */ @@ -572,7 +575,7 @@ public AColGroup addVector(double[] v) { /** * Detect if the column group contains a specific value. - * + * * @param pattern The value to look for. * @return boolean saying true if the value is contained. */ @@ -580,7 +583,7 @@ public AColGroup addVector(double[] v) { /** * Get the number of nonZeros contained in this column group. - * + * * @param nRows The number of rows in the column group, this is used for groups that does not contain information * about how many rows they have. * @return The nnz. 
@@ -589,7 +592,7 @@ public AColGroup addVector(double[] v) { /** * Make a copy of the column group values, and replace all values that match pattern with replacement value. - * + * * @param pattern The value to look for * @param replace The value to replace the other value with * @return A new Column Group, reusing the index structure but with new values. @@ -598,7 +601,7 @@ public AColGroup addVector(double[] v) { /** * Compute the column sum - * + * * @param c The array to add the column sum to. * @param nRows The number of rows in the column group. */ @@ -606,7 +609,7 @@ public AColGroup addVector(double[] v) { /** * Central Moment instruction executed on a column group. - * + * * @param op The Operator to use. * @param nRows The number of rows contained in the ColumnGroup. * @return A Central Moment object. @@ -615,7 +618,7 @@ public AColGroup addVector(double[] v) { /** * Expand the column group to multiple columns. (one hot encode the column group) - * + * * @param max The number of columns to expand to and cutoff values at. * @param ignore If zero and negative values should be ignored. * @param cast If the double values contained should be cast to whole numbers. @@ -626,7 +629,7 @@ public AColGroup addVector(double[] v) { /** * Get the computation cost associated with this column group. - * + * * @param e The computation cost estimator * @param nRows the number of rows in the column group * @return The cost of this column group @@ -635,7 +638,7 @@ public AColGroup addVector(double[] v) { /** * Perform unary operation on the column group and return a new column group - * + * * @param op The operation to perform * @return The new column group */ @@ -643,19 +646,19 @@ public AColGroup addVector(double[] v) { /** * Get if the group is only containing zero - * + * * @return true if empty */ public abstract boolean isEmpty(); /** - * Append the other column group to this column group. 
This method tries to combine them to return a new column group - * containing both. In some cases it is possible in reasonable time, in others it is not. - * + * Append the other column group to this column group. This method tries to combine them to return a new column + * group containing both. In some cases it is possible in reasonable time, in others it is not. + * * The result is first this column group followed by the other column group in higher row values. - * + * * If it is not possible or very inefficient null is returned. - * + * * @param g The other column group * @return A combined column group or null */ @@ -663,9 +666,9 @@ public AColGroup addVector(double[] v) { /** * Append all column groups in the list provided together in one go allocating the output once. - * + * * If it is not possible or very inefficient null is returned. - * + * * @param groups The groups to combine. * @param blen The normal number of rows in the groups * @param rlen The total number of rows of all combined. @@ -677,11 +680,11 @@ public static AColGroup appendN(AColGroup[] groups, int blen, int rlen) { /** * Append all column groups in the list provided together with this. - * + * * A Important detail is the first entry in the group == this, and should not be appended twice. - * + * * If it is not possible or very inefficient null is returned. - * + * * @param groups The groups to combine. * @param blen The normal number of rows in the groups * @param rlen The total number of rows of all combined. @@ -691,7 +694,7 @@ public static AColGroup appendN(AColGroup[] groups, int blen, int rlen) { /** * Get the compression scheme for this column group to enable compression of other data. - * + * * @return The compression scheme of this column group */ public abstract ICLAScheme getCompressionScheme(); @@ -705,14 +708,14 @@ public void clear() { /** * Recompress this column group into a new column group. - * + * * @return A new or the same column group depending on optimization goal. 
*/ public abstract AColGroup recompress(); /** * Recompress this column group into a new column group of the given type. - * + * * @param ct The compressionType that the column group should morph into * @param nRow The number of rows in this columngroup. * @return A new column group @@ -742,7 +745,7 @@ else if(ct == CompressionType.UNCOMPRESSED) { /** * Get the compression info for this column group. - * + * * @param nRow The number of rows in this column group. * @return The compression info for this group. */ @@ -750,7 +753,7 @@ else if(ct == CompressionType.UNCOMPRESSED) { /** * Combine this column group with another - * + * * @param other The other column group to combine with. * @param nRow The number of rows in both column groups. * @return A combined representation as a column group. @@ -761,7 +764,7 @@ public AColGroup combine(AColGroup other, int nRow) { /** * Get encoding of this column group. - * + * * @return The encoding of the index structure. */ public IEncode getEncoding() { @@ -782,19 +785,19 @@ public AColGroup sortColumnIndexes() { /** * Perform row sum on the internal dictionaries, and return the same index structure. - * + * * This method returns null on empty column groups. - * + * * Note this method does not guarantee correct behavior if the given group is AMorphingGroup, instead it should be * morphed to a valid columngroup via extractCommon first. - * + * * @return The reduced colgroup. */ public abstract AColGroup reduceCols(); /** * Selection (left matrix multiply) - * + * * @param selection A sparse matrix with "max" a single one in each row all other values are zero. * @param points The coordinates in the selection matrix to extract. 
* @param ret The MatrixBlock to decompress the selected rows into @@ -807,17 +810,17 @@ public final void selectionMultiply(MatrixBlock selection, P[] points, MatrixBlo else denseSelection(selection, points, ret, rl, ru); } - + /** * Get an approximate sparsity of this column group - * + * * @return the approximate sparsity of this columngroup */ public abstract double getSparsity(); /** * Sparse selection (left matrix multiply) - * + * * @param selection A sparse matrix with "max" a single one in each row all other values are zero. * @param points The coordinates in the selection matrix to extract. * @param ret The Sparse MatrixBlock to decompress the selected rows into @@ -828,7 +831,7 @@ public final void selectionMultiply(MatrixBlock selection, P[] points, MatrixBlo /** * Dense selection (left matrix multiply) - * + * * @param selection A sparse matrix with "max" a single one in each row all other values are zero. * @param points The coordinates in the selection matrix to extract. * @param ret The Dense MatrixBlock to decompress the selected rows into @@ -840,7 +843,7 @@ public final void selectionMultiply(MatrixBlock selection, P[] points, MatrixBlo /** * Method to determine if the columnGroup have the same index structure as another. Note that the column indexes and * dictionaries are allowed to be different. - * + * * @param that the other column group * @return if the index is the same. */ @@ -851,7 +854,7 @@ public boolean sameIndexStructure(AColGroup that) { /** * C bind the list of column groups with this column group. the list of elements provided in the index of each list * is guaranteed to have the same index structures - * + * * @param nRow The number of rows contained in all right and this column group. 
* @param nCol The number of columns to shift the right hand side column groups over when combining, this should * only effect the column indexes @@ -889,7 +892,7 @@ public AColGroup combineWithSameIndex(int nRow, int nCol, List right) /** * C bind the given column group to this. - * + * * @param nRow The number of rows contained in the right and this column group. * @param nCol The number of columns in this. * @param right The column group to c-bind. @@ -929,16 +932,16 @@ protected IColIndex combineColIndexes(final int nCol, List right) { /** * This method returns a list of column groups that are naive splits of this column group as if it is reshaped. - * + * * This means the column groups rows are split into x number of other column groups where x is the multiplier. - * + * * The indexes are assigned round robbin to each of the output groups, meaning the first index is assigned. - * + * * If for instance the 4. column group is split by a 2 multiplier and there was 5 columns in total originally. The * output becomes 2 column groups at column index 4 and one at 9. - * + * * If possible the split column groups should reuse pointers back to the original dictionaries! - * + * * @param multiplier The number of column groups to split into * @param nRow The number of rows in this column group in case the underlying column group does not know * @param nColOrg The number of overall columns in the host CompressedMatrixBlock. @@ -948,25 +951,25 @@ protected IColIndex combineColIndexes(final int nCol, List right) { /** * This method returns a list of column groups that are naive splits of this column group as if it is reshaped. - * + * * This means the column groups rows are split into x number of other column groups where x is the multiplier. - * + * * The indexes are assigned round robbin to each of the output groups, meaning the first index is assigned. - * + * * If for instance the 4. column group is split by a 2 multiplier and there was 5 columns in total originally. 
The * output becomes 2 column groups at column index 4 and one at 9. - * + * * If possible the split column groups should reuse pointers back to the original dictionaries! - * + * * This specific variation is pushing down the parallelization given via the executor service provided. If not * overwritten the default is to call the normal split reshape - * + * * @param multiplier The number of column groups to split into * @param nRow The number of rows in this column group in case the underlying column group does not know * @param nColOrg The number of overall columns in the host CompressedMatrixBlock * @param pool The executor service to submit parallel tasks to - * @throws Exception In case there is an error we throw the exception out instead of handling it * @return a list of split column groups + * @throws Exception In case there is an error we throw the exception out instead of handling it */ public AColGroup[] splitReshapePushDown(final int multiplier, final int nRow, final int nColOrg, final ExecutorService pool) throws Exception { diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java index c5de46b161c..67f2c492e09 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java @@ -107,7 +107,7 @@ private ColGroupFactory(MatrixBlock in, CompressedSizeInfo csi, CompressionSetti /** * The actual compression method, that handles the logic of compressing multiple columns together. - * + * * @param in The input matrix, that could have been transposed. If it is transposed the compSettings should specify * this. 
* @param csi The compression information extracted from the estimation, this contains which groups of columns to @@ -121,7 +121,7 @@ public static List compressColGroups(MatrixBlock in, CompressedSizeIn /** * The actual compression method, that handles the logic of compressing multiple columns together. - * + * * @param in The input matrix, that could have been transposed. If it is transposed the compSettings should specify * this. * @param csi The compression information extracted from the estimation, this contains which groups of columns to @@ -136,7 +136,7 @@ public static List compressColGroups(MatrixBlock in, CompressedSizeIn } /** - * + * * @param in The input matrix, that could have been transposed. If it is transposed the compSettings should specify * this. * @param csi The compression information extracted from the estimation, this contains which groups of columns to @@ -233,8 +233,9 @@ private void logEstVsActual(double time, AColGroup act, CompressedSizeInfoColGro time, retType, estC, actC, act.getNumValues(), cols, wanted, warning)); } else { - LOG.debug(String.format("time[ms]: %10.2f %25s est %10.0f -- act %10.0f distinct:%5d cols:%s wanted:%s", - time, retType, estC, actC, act.getNumValues(), cols, wanted)); + LOG.debug( + String.format("time[ms]: %10.2f %25s est %10.0f -- act %10.0f distinct:%5d cols:%s wanted:%s", time, + retType, estC, actC, act.getNumValues(), cols, wanted)); } } @@ -304,11 +305,11 @@ else if(ct == CompressionType.LinearFunctional) { return compressLinearFunctional(colIndexes, in, cs); } } - else if(ct == CompressionType.PiecewiseLinear) { + else if(ct == CompressionType.PiecewiseLinear) { - return compressPiecewiseLinearFunctional(colIndexes, in, cs); + return compressPiecewiseLinearFunctional(colIndexes, in, cs); - } + } else if(ct == CompressionType.DDCFOR) { AColGroup g = directCompressDDC(colIndexes, cg); if(g instanceof ColGroupDDC) @@ -704,7 +705,7 @@ private AColGroup directCompressDeltaDDC(IColIndex colIndexes, 
CompressedSizeInf if(cs.scaleFactors != null) { throw new NotImplementedException("Delta encoding with quantization not yet implemented"); } - + if(colIndexes.size() > 1) { return directCompressDeltaDDCMultiCol(colIndexes, cg); } @@ -736,7 +737,7 @@ private AColGroup directCompressDeltaDDCSingleCol(IColIndex colIndexes, Compress if(map.size() == 0) return new ColGroupEmpty(colIndexes); - + final double[] dictValues = map.getDictionary(); IDictionary dict = new DeltaDictionary(dictValues, 1); @@ -745,7 +746,8 @@ private AColGroup directCompressDeltaDDCSingleCol(IColIndex colIndexes, Compress return ColGroupDeltaDDC.create(colIndexes, dict, resData, null); } - private AColGroup directCompressDeltaDDCMultiCol(IColIndex colIndexes, CompressedSizeInfoColGroup cg) throws Exception { + private AColGroup directCompressDeltaDDCMultiCol(IColIndex colIndexes, CompressedSizeInfoColGroup cg) + throws Exception { final AMapToData d = MapToFactory.create(nRow, Math.max(Math.min(cg.getNumOffs() + 1, nRow), 126)); final int fill = d.getUpperBoundValue(); d.fill(fill); @@ -824,8 +826,8 @@ private boolean readToMapDDC(IColIndex colIndexes, DblArrayCountHashMap map, AMa int fill) { ReaderColumnSelection reader = (cs.scaleFactors == null) ? 
ReaderColumnSelection.createReader(in, colIndexes, - cs.transposed, rl, - ru) : ReaderColumnSelection.createQuantizedReader(in, colIndexes, cs.transposed, rl, ru, cs.scaleFactors); + cs.transposed, rl, ru) : ReaderColumnSelection.createQuantizedReader(in, colIndexes, cs.transposed, rl, ru, + cs.scaleFactors); DblArray cellVals = reader.nextRow(); boolean extra = false; @@ -1072,185 +1074,179 @@ private static AColGroup compressLinearFunctional(IColIndex colIndexes, MatrixBl return ColGroupLinearFunctional.create(colIndexes, coefficients, numRows); } - public static AColGroup compressPiecewiseLinearFunctional(IColIndex colIndexes, MatrixBlock in, CompressionSettings cs) { - - - //Erstmal den Inhalt einer Spalte speichern - - int numRows = in.getNumRows(); - int colIdx = colIndexes.get(0); //Die erste Spalte - double[] column = getColumn(in,colIdx); - - //Sette den Targetloss - - // Breakpoints bestimmen: Einteilung der Segmente - - List breakpointsList = computeBreakpoints(cs, column); - int[] breakpoints = breakpointsList.stream().mapToInt(Integer::intValue).toArray(); - //Für jedes Segment lineare Regression als kompressionsverfahren - - // 3) Pro Segment Regression -> a,b - int numSeg = breakpoints.length - 1; - double[] slopes = new double[numSeg]; - double[] intercepts = new double[numSeg]; - - for (int s = 0; s < numSeg; s++) { - int start = breakpoints[s]; - int end = breakpoints[s + 1]; - - double[] ab = regressSegment(column, start, end); // nutzt gleiche Stats wie computeSegmentCost - slopes[s] = ab[0]; - intercepts[s] = ab[1]; - } - //Erstelle die Datenstruktur: PiecewiseLinearColGroupCompressed - - return ColGroupPiecewiseLinearCompressed.create( - colIndexes, - breakpoints, - slopes, - intercepts, - numRows); - } - - - public static double[] getColumn(MatrixBlock in, int colIndex) { - int numRows = in.getNumRows(); // Anzahl der Zeilen [web:16] - double[] column = new double[numRows]; // Variable für die Spalte - - for (int r = 0; r < numRows; r++) { - 
column[r] = in.get(r, colIndex); // Wert (r, colIndex) lesen [web:16][web:25] - } - return column; - } - public static List computeBreakpoints(CompressionSettings cs, double[] column){ - int n = column.length; - double targetMSE = cs.getPiecewiseTargetLoss(); - // Fall A: kein TargetLoss angegeben -> einfache Variante mit fixem λ - if (Double.isNaN(targetMSE) || targetMSE <= 0) { - double lambda = 5.0; - return computeBreakpointsLambda(column, lambda); - } - - // Fall B: TargetLoss gesetzt -> globales Fehlerbudget berücksichtigen - double sseMax = n * targetMSE; // MSE -> SSE-Budget - - double lambdaMin = 0.0; // viele Segmente, minimaler Fehler - double lambdaMax = 1e6; // wenige Segmente, mehr Fehler - - List bestBreaks = null; - - for (int it = 0; it < 20; it++) { // Binärsuche auf λ - double lambda = 0.5 * (lambdaMin + lambdaMax); - - List breaks = computeBreakpointsLambda(column, lambda); - double totalSSE = computeTotalSSE(column, breaks); - - if (totalSSE <= sseMax) { - // Budget eingehalten: wir können versuchen, mit größerem λ noch weniger Segmente zu nehmen - bestBreaks = breaks; - lambdaMin = lambda; - } else { - // Fehler zu groß: λ verkleinern, mehr Segmente zulassen - lambdaMax = lambda; - } - } - - if (bestBreaks == null) - bestBreaks = computeBreakpointsLambda(column, lambdaMin); - - return bestBreaks; - } - public static List computeBreakpointsLambda(double[] column, double lambda) { - int sizeColumn = column.length; - double[] dp = new double[sizeColumn + 1]; - int[] prev = new int[sizeColumn + 1]; - - dp[0] = 0.0; - - for (int index = 1; index <= sizeColumn; index++) { - dp[index] = Double.POSITIVE_INFINITY; - for (int i = 0; i < index; i++) { // Segment [i, index) - double costCurrentSegment = computeSegmentCost(column, i, index); // SSE - double candidateCost = dp[i] + costCurrentSegment + lambda; - if (candidateCost < dp[index]) { - dp[index] = candidateCost; - prev[index] = i; - } - } - } - - List segmentLimits = new ArrayList<>(); - int 
breakpointIndex = sizeColumn; - while (breakpointIndex > 0) { - segmentLimits.add(breakpointIndex); - breakpointIndex = prev[breakpointIndex]; - } - segmentLimits.add(0); - Collections.sort(segmentLimits); - return segmentLimits; - } - - public static double computeSegmentCost(double[] column, int start, int end) { - int n = end - start; - if (n <= 1) - return 0.0; - - double[] ab = regressSegment(column, start, end); - double slope = ab[0]; - double intercept = ab[1]; - - double sse = 0.0; - for (int i = start; i < end; i++) { - double x = i; - double y = column[i]; - double yhat = slope * x + intercept; - double diff = y - yhat; - sse += diff * diff; - } - return sse; // oder sse / n als MSE - } - public static double computeTotalSSE(double[] column, List breaks) { - double total = 0.0; - for (int s = 0; s < breaks.size() - 1; s++) { - int start = breaks.get(s); - int end = breaks.get(s + 1); - total += computeSegmentCost(column, start, end); // SSE des Segments - } - return total; - } - - - public static double[] regressSegment(double[] column, int start, int end) { - int n = end - start; - if (n <= 0) - return new double[] {0.0, 0.0}; - - double sumX = 0, sumY = 0, sumXX = 0, sumXY = 0; - for (int i = start; i < end; i++) { - double x = i; - double y = column[i]; - sumX += x; - sumY += y; - sumXX += x * x; - sumXY += x * y; - } - - double nD = n; - double denom = nD * sumXX - sumX * sumX; - double slope, intercept; - if (denom == 0) { - slope = 0.0; - intercept = sumY / nD; - } - else { - slope = (nD * sumXY - sumX * sumY) / denom; - intercept = (sumY - slope * sumX) / nD; - } - return new double[] {slope, intercept}; - } + public static AColGroup compressPiecewiseLinearFunctional(IColIndex colIndexes, MatrixBlock in, + CompressionSettings cs) { + + //Erstmal den Inhalt einer Spalte speichern + + int numRows = in.getNumRows(); + int colIdx = colIndexes.get(0); //Die erste Spalte + double[] column = getColumn(in, colIdx); + + //Sette den Targetloss + + // 
Breakpoints bestimmen: Einteilung der Segmente + + List breakpointsList = computeBreakpoints(cs, column); + int[] breakpoints = breakpointsList.stream().mapToInt(Integer::intValue).toArray(); + //Für jedes Segment lineare Regression als kompressionsverfahren + + // 3) Pro Segment Regression -> a,b + int numSeg = breakpoints.length - 1; + double[] slopes = new double[numSeg]; + double[] intercepts = new double[numSeg]; + + for(int s = 0; s < numSeg; s++) { + int start = breakpoints[s]; + int end = breakpoints[s + 1]; + + double[] ab = regressSegment(column, start, end); // nutzt gleiche Stats wie computeSegmentCost + slopes[s] = ab[0]; + intercepts[s] = ab[1]; + } + //Erstelle die Datenstruktur: PiecewiseLinearColGroupCompressed + + return ColGroupPiecewiseLinearCompressed.create(colIndexes, breakpoints, slopes, intercepts, numRows); + } + + public static double[] getColumn(MatrixBlock in, int colIndex) { + int numRows = in.getNumRows(); // Anzahl der Zeilen [web:16] + double[] column = new double[numRows]; // Variable für die Spalte + + for(int r = 0; r < numRows; r++) { + column[r] = in.get(r, colIndex); // Wert (r, colIndex) lesen [web:16][web:25] + } + return column; + } + + public static List computeBreakpoints(CompressionSettings cs, double[] column) { + int n = column.length; + double targetMSE = cs.getPiecewiseTargetLoss(); + // Fall A: kein TargetLoss angegeben -> einfache Variante mit fixem λ + if(Double.isNaN(targetMSE) || targetMSE <= 0) { + double lambda = 5.0; + return computeBreakpointsLambda(column, lambda); + } + + // Fall B: TargetLoss gesetzt -> globales Fehlerbudget berücksichtigen + double sseMax = n * targetMSE; // MSE -> SSE-Budget + + double lambdaMin = 0.0; // viele Segmente, minimaler Fehler + double lambdaMax = 1e6; // wenige Segmente, mehr Fehler + + List bestBreaks = null; + for(int it = 0; it < 20; it++) { // Binärsuche auf λ + double lambda = 0.5 * (lambdaMin + lambdaMax); + List breaks = computeBreakpointsLambda(column, lambda); + 
double totalSSE = computeTotalSSE(column, breaks); + if(totalSSE <= sseMax) { + // Budget eingehalten: wir können versuchen, mit größerem λ noch weniger Segmente zu nehmen + bestBreaks = breaks; + lambdaMin = lambda; + } + else { + // Fehler zu groß: λ verkleinern, mehr Segmente zulassen + lambdaMax = lambda; + } + } + + if(bestBreaks == null) + bestBreaks = computeBreakpointsLambda(column, lambdaMin); + + return bestBreaks; + } + + public static List computeBreakpointsLambda(double[] column, double lambda) { + int sizeColumn = column.length; + double[] dp = new double[sizeColumn + 1]; + int[] prev = new int[sizeColumn + 1]; + + dp[0] = 0.0; + + for(int index = 1; index <= sizeColumn; index++) { + dp[index] = Double.POSITIVE_INFINITY; + for(int i = 0; i < index; i++) { // Segment [i, index) + double costCurrentSegment = computeSegmentCost(column, i, index); // SSE + double candidateCost = dp[i] + costCurrentSegment + lambda; + if(candidateCost < dp[index]) { + dp[index] = candidateCost; + prev[index] = i; + } + } + } + + List segmentLimits = new ArrayList<>(); + int breakpointIndex = sizeColumn; + while(breakpointIndex > 0) { + segmentLimits.add(breakpointIndex); + breakpointIndex = prev[breakpointIndex]; + } + segmentLimits.add(0); + Collections.sort(segmentLimits); + return segmentLimits; + } + + public static double computeSegmentCost(double[] column, int start, int end) { + int n = end - start; + if(n <= 1) + return 0.0; + + double[] ab = regressSegment(column, start, end); + double slope = ab[0]; + double intercept = ab[1]; + + double sse = 0.0; + for(int i = start; i < end; i++) { + double x = i; + double y = column[i]; + double yhat = slope * x + intercept; + double diff = y - yhat; + sse += diff * diff; + } + return sse; // oder sse / n als MSE + } + + public static double computeTotalSSE(double[] column, List breaks) { + double total = 0.0; + for(int s = 0; s < breaks.size() - 1; s++) { + int start = breaks.get(s); + int end = breaks.get(s + 1); + total += 
computeSegmentCost(column, start, end); // SSE des Segments + } + return total; + } + + public static double[] regressSegment(double[] column, int start, int end) { + int n = end - start; + if(n <= 0) + return new double[] {0.0, 0.0}; + + double sumX = 0, sumY = 0, sumXX = 0, sumXY = 0; + for(int i = start; i < end; i++) { + double x = i; + double y = column[i]; + sumX += x; + sumY += y; + sumXX += x * x; + sumXY += x * y; + } + + double nD = n; + double denom = nD * sumXX - sumX * sumX; + double slope, intercept; + if(denom == 0) { + slope = 0.0; + intercept = sumY / nD; + } + else { + slope = (nD * sumXY - sumX * sumY) / denom; + intercept = (sumY - slope * sumX) / nD; + } + return new double[] {slope, intercept}; + } private AColGroup compressSDCFromSparseTransposedBlock(IColIndex cols, int nrUniqueEstimate, double tupleSparsity) { if(cols.size() > 1) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java index 4062c4da611..1f39dc44cb0 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java @@ -21,381 +21,372 @@ public class ColGroupPiecewiseLinearCompressed extends AColGroupCompressed { - IColIndex colIndexes; - int[] breakpoints; - double[] slopes; - double[] intercepts; - int numRows; + IColIndex colIndexes; + int[] breakpoints; + double[] slopes; + double[] intercepts; + int numRows; - protected ColGroupPiecewiseLinearCompressed(IColIndex colIndices) { - super(colIndices); - } + protected ColGroupPiecewiseLinearCompressed(IColIndex colIndices) { + super(colIndices); + } + public ColGroupPiecewiseLinearCompressed(IColIndex colIndexes, int[] breakpoints, double[] slopes, + double[] intercepts, int numRows) { + super(colIndexes); + 
this.breakpoints = breakpoints; + this.slopes = slopes; + this.intercepts = intercepts; + this.numRows = numRows; + } - public ColGroupPiecewiseLinearCompressed(IColIndex colIndexes, int[] breakpoints, double[] slopes, double[] intercepts, int numRows) { - super(colIndexes); - this.breakpoints = breakpoints; - this.slopes = slopes; - this.intercepts = intercepts; - this.numRows = numRows; - } + public static AColGroup create(IColIndex colIndexes, int[] breakpoints, double[] slopes, double[] intercepts, + int numRows) { + if(breakpoints == null || breakpoints.length < 2) + throw new IllegalArgumentException("Need at least one segment"); + int numSeg = breakpoints.length - 1; + if(slopes.length != numSeg || intercepts.length != numSeg) + throw new IllegalArgumentException("Inconsistent segment arrays"); - public static AColGroup create(IColIndex colIndexes, int[] breakpoints, double[] slopes, double[] intercepts, int numRows) { - if (breakpoints == null || breakpoints.length < 2) - throw new IllegalArgumentException("Need at least one segment"); + int[] bpCopy = Arrays.copyOf(breakpoints, breakpoints.length); + double[] slopeCopy = Arrays.copyOf(slopes, slopes.length); + double[] interceptCopy = Arrays.copyOf(intercepts, intercepts.length); - int numSeg = breakpoints.length - 1; - if (slopes.length != numSeg || intercepts.length != numSeg) - throw new IllegalArgumentException("Inconsistent segment arrays"); + return new ColGroupPiecewiseLinearCompressed(colIndexes, bpCopy, slopeCopy, interceptCopy, numRows); - int[] bpCopy = Arrays.copyOf(breakpoints, breakpoints.length); - double[] slopeCopy = Arrays.copyOf(slopes, slopes.length); - double[] interceptCopy = Arrays.copyOf(intercepts, intercepts.length); + } + @Override + public void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int offC) { - return new ColGroupPiecewiseLinearCompressed( - colIndexes, - bpCopy, - slopeCopy, - interceptCopy, - numRows); + if(db == null || _colIndexes == null || 
_colIndexes.size() == 0 || breakpoints == null || slopes == null || + intercepts == null) { + return; + } - } + int numSeg = breakpoints.length - 1; + if(numSeg <= 0 || rl >= ru) { + return; + } - @Override - public void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int offC) { + final int col = _colIndexes.get(0); - if (db == null || _colIndexes == null || _colIndexes.size() == 0 || - breakpoints == null || slopes == null || intercepts == null) { - return; - } + for(int s = 0; s < numSeg; s++) { + int segStart = breakpoints[s]; + int segEnd = breakpoints[s + 1]; + if(segStart >= segEnd) + continue; // Invalid Segment - int numSeg = breakpoints.length - 1; - if (numSeg <= 0 || rl >= ru) { - return; - } + double a = slopes[s]; + double b = intercepts[s]; - final int col = _colIndexes.get(0); + int rs = Math.max(segStart, rl); + int re = Math.min(segEnd, ru); + if(rs >= re) + continue; - for (int s = 0; s < numSeg; s++) { - int segStart = breakpoints[s]; - int segEnd = breakpoints[s + 1]; - if (segStart >= segEnd) continue; // Invalid Segment + for(int r = rs; r < re; r++) { + double yhat = a * r + b; + int gr = offR + r; + int gc = offC + col; - double a = slopes[s]; - double b = intercepts[s]; + if(gr >= 0 && gr < db.numRows() && gc >= 0 && gc < db.numCols()) { + db.set(gr, gc, yhat); + } + } + } + } - int rs = Math.max(segStart, rl); - int re = Math.min(segEnd, ru); - if (rs >= re) continue; + @Override + protected double computeMxx(double c, Builtin builtin) { + return 0; + } - for (int r = rs; r < re; r++) { - double yhat = a * r + b; - int gr = offR + r; - int gc = offC + col; + @Override + protected void computeColMxx(double[] c, Builtin builtin) { + } - if (gr >= 0 && gr < db.numRows() && gc >= 0 && gc < db.numCols()) { - db.set(gr, gc, yhat); - } - } - } - } + @Override + protected void computeSum(double[] c, int nRows) { - @Override - protected double computeMxx(double c, Builtin builtin) { - return 0; - } + } - @Override - protected void 
computeColMxx(double[] c, Builtin builtin) { + @Override + protected void computeSumSq(double[] c, int nRows) { - } + } - @Override - protected void computeSum(double[] c, int nRows) { + @Override + protected void computeColSumsSq(double[] c, int nRows) { - } + } - @Override - protected void computeSumSq(double[] c, int nRows) { + @Override + protected void computeRowSums(double[] c, int rl, int ru, double[] preAgg) { - } + } - @Override - protected void computeColSumsSq(double[] c, int nRows) { + @Override + protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru, double[] preAgg) { - } + } - @Override - protected void computeRowSums(double[] c, int rl, int ru, double[] preAgg) { + @Override + protected void computeProduct(double[] c, int nRows) { - } + } - @Override - protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru, double[] preAgg) { + @Override + protected void computeRowProduct(double[] c, int rl, int ru, double[] preAgg) { - } + } - @Override - protected void computeProduct(double[] c, int nRows) { + @Override + protected void computeColProduct(double[] c, int nRows) { - } + } - @Override - protected void computeRowProduct(double[] c, int rl, int ru, double[] preAgg) { + @Override + protected double[] preAggSumRows() { + return new double[0]; + } - } + @Override + protected double[] preAggSumSqRows() { + return new double[0]; + } - @Override - protected void computeColProduct(double[] c, int nRows) { + @Override + protected double[] preAggProductRows() { + return new double[0]; + } - } + @Override + protected double[] preAggBuiltinRows(Builtin builtin) { + return new double[0]; + } - @Override - protected double[] preAggSumRows() { - return new double[0]; - } + @Override + public boolean sameIndexStructure(AColGroupCompressed that) { + return false; + } - @Override - protected double[] preAggSumSqRows() { - return new double[0]; - } + @Override + protected void tsmm(double[] result, int numColumns, int nRows) { - 
@Override - protected double[] preAggProductRows() { - return new double[0]; - } + } - @Override - protected double[] preAggBuiltinRows(Builtin builtin) { - return new double[0]; - } + @Override + public AColGroup copyAndSet(IColIndex colIndexes) { + return null; + } - @Override - public boolean sameIndexStructure(AColGroupCompressed that) { - return false; - } + @Override + public void decompressToDenseBlockTransposed(DenseBlock db, int rl, int ru) { - @Override - protected void tsmm(double[] result, int numColumns, int nRows) { + } - } + @Override + public void decompressToSparseBlockTransposed(SparseBlockMCSR sb, int nColOut) { - @Override - public AColGroup copyAndSet(IColIndex colIndexes) { - return null; - } + } - @Override - public void decompressToDenseBlockTransposed(DenseBlock db, int rl, int ru) { + @Override + public double getIdx(int r, int colIdx) { + // ✅ CRUCIAL: Bounds-Check für colIdx! + if(r < 0 || r >= numRows || colIdx < 0 || colIdx >= _colIndexes.size()) { + return 0.0; + } - } + // Segment-Suche (sicher jetzt) + int seg = 0; + for(int i = 1; i < breakpoints.length; i++) { + if(r < breakpoints[i]) { + break; + } + seg = i - 1; // seg < numSeg immer! + } - @Override - public void decompressToSparseBlockTransposed(SparseBlockMCSR sb, int nColOut) { + return slopes[seg] * (double) r + intercepts[seg]; + } - } + @Override + public int getNumValues() { + return breakpoints.length + slopes.length + intercepts.length; + } - @Override - public double getIdx(int r, int colIdx) { - // ✅ CRUCIAL: Bounds-Check für colIdx! - if (r < 0 || r >= numRows || colIdx < 0 || colIdx >= _colIndexes.size()) { - return 0.0; - } + @Override + public CompressionType getCompType() { + return null; + } - // Segment-Suche (sicher jetzt) - int seg = 0; - for (int i = 1; i < breakpoints.length; i++) { - if (r < breakpoints[i]) { - break; - } - seg = i - 1; // seg < numSeg immer! 
- } + @Override + protected ColGroupType getColGroupType() { + return null; + } - return slopes[seg] * (double) r + intercepts[seg]; - } + @Override + public void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC) { - @Override - public int getNumValues() { - return breakpoints.length + slopes.length + intercepts.length; - } + } - @Override - public CompressionType getCompType() { - return null; - } + @Override + public AColGroup rightMultByMatrix(MatrixBlock right, IColIndex allCols, int k) { + return null; + } - @Override - protected ColGroupType getColGroupType() { - return null; - } + @Override + public void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock result, int rl, int ru, int cl, int cu) { + + } + + @Override + public void leftMultByAColGroup(AColGroup lhs, MatrixBlock result, int nRows) { + + } + + @Override + public void tsmmAColGroup(AColGroup other, MatrixBlock result) { + + } + + @Override + public AColGroup scalarOperation(ScalarOperator op) { + return null; + } + + @Override + public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) { + return null; + } + + @Override + public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) { + return null; + } + + @Override + protected AColGroup sliceSingleColumn(int idx) { + return null; + } + @Override + protected AColGroup sliceMultiColumns(int idStart, int idEnd, IColIndex outputCols) { + return null; + } + + @Override + public AColGroup sliceRows(int rl, int ru) { + return null; + } + + @Override + public boolean containsValue(double pattern) { + return false; + } + + @Override + public long getNumberNonZeros(int nRows) { + return 0; + } + + @Override + public AColGroup replace(double pattern, double replace) { + return null; + } + + @Override + public void computeColSums(double[] c, int nRows) { + + } + + @Override + public CmCovObject centralMoment(CMOperator op, int nRows) { + return null; + } + + @Override + public 
AColGroup rexpandCols(int max, boolean ignore, boolean cast, int nRows) { + return null; + } + + @Override + public double getCost(ComputationCostEstimator e, int nRows) { + return 0; + } + + @Override + public AColGroup unaryOperation(UnaryOperator op) { + return null; + } + @Override + public AColGroup append(AColGroup g) { + return null; + } - @Override - public void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC) { - - } - - @Override - public AColGroup rightMultByMatrix(MatrixBlock right, IColIndex allCols, int k) { - return null; - } - - @Override - public void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock result, int rl, int ru, int cl, int cu) { - - } - - @Override - public void leftMultByAColGroup(AColGroup lhs, MatrixBlock result, int nRows) { - - } - - @Override - public void tsmmAColGroup(AColGroup other, MatrixBlock result) { - - } - - @Override - public AColGroup scalarOperation(ScalarOperator op) { - return null; - } - - @Override - public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) { - return null; - } - - @Override - public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) { - return null; - } - - @Override - protected AColGroup sliceSingleColumn(int idx) { - return null; - } - - @Override - protected AColGroup sliceMultiColumns(int idStart, int idEnd, IColIndex outputCols) { - return null; - } - - @Override - public AColGroup sliceRows(int rl, int ru) { - return null; - } - - @Override - public boolean containsValue(double pattern) { - return false; - } - - @Override - public long getNumberNonZeros(int nRows) { - return 0; - } - - @Override - public AColGroup replace(double pattern, double replace) { - return null; - } - - @Override - public void computeColSums(double[] c, int nRows) { - - } - - @Override - public CmCovObject centralMoment(CMOperator op, int nRows) { - return null; - } - - - @Override - public AColGroup rexpandCols(int max, boolean 
ignore, boolean cast, int nRows) { - return null; - } - - @Override - public double getCost(ComputationCostEstimator e, int nRows) { - return 0; - } - - @Override - public AColGroup unaryOperation(UnaryOperator op) { - return null; - } - - @Override - public AColGroup append(AColGroup g) { - return null; - } - - @Override - protected AColGroup appendNInternal(AColGroup[] groups, int blen, int rlen) { - return null; - } - - @Override - public ICLAScheme getCompressionScheme() { - return null; - } - - @Override - public AColGroup recompress() { - return null; - } - - @Override - public CompressedSizeInfoColGroup getCompressionInfo(int nRow) { - return null; - } - - @Override - protected AColGroup fixColIndexes(IColIndex newColIndex, int[] reordering) { - return null; - } - - @Override - public AColGroup reduceCols() { - return null; - } - - @Override - public double getSparsity() { - return 0; - } - - @Override - protected void sparseSelection(MatrixBlock selection, ColGroupUtils.P[] points, MatrixBlock ret, int rl, int ru) { - - } - - @Override - protected void denseSelection(MatrixBlock selection, ColGroupUtils.P[] points, MatrixBlock ret, int rl, int ru) { - - } - - @Override - public AColGroup[] splitReshape(int multiplier, int nRow, int nColOrg) { - return new AColGroup[0]; - } - - public int[] getBreakpoints() { - return breakpoints; - } - - public double[] getSlopes() { - return slopes; - } - - - public double[] getIntercepts() { - return intercepts; - } + @Override + protected AColGroup appendNInternal(AColGroup[] groups, int blen, int rlen) { + return null; + } + + @Override + public ICLAScheme getCompressionScheme() { + return null; + } + + @Override + public AColGroup recompress() { + return null; + } + + @Override + public CompressedSizeInfoColGroup getCompressionInfo(int nRow) { + return null; + } + + @Override + protected AColGroup fixColIndexes(IColIndex newColIndex, int[] reordering) { + return null; + } + + @Override + public AColGroup reduceCols() { 
+ return null; + } + + @Override + public double getSparsity() { + return 0; + } + + @Override + protected void sparseSelection(MatrixBlock selection, ColGroupUtils.P[] points, MatrixBlock ret, int rl, int ru) { + + } + + @Override + protected void denseSelection(MatrixBlock selection, ColGroupUtils.P[] points, MatrixBlock ret, int rl, int ru) { + + } + + @Override + public AColGroup[] splitReshape(int multiplier, int nRow, int nColOrg) { + return new AColGroup[0]; + } + + public int[] getBreakpoints() { + return breakpoints; + } + + public double[] getSlopes() { + return slopes; + } + + public double[] getIntercepts() { + return intercepts; + } } diff --git a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java index 5b3688be5b1..4f309fda967 100644 --- a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java +++ b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java @@ -11,696 +11,689 @@ import org.apache.sysds.runtime.data.DenseBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.junit.Test; + import java.util.Arrays; import java.util.List; + import static org.apache.sysds.runtime.compress.colgroup.ColGroupFactory.*; import static org.junit.Assert.*; - public class ColGroupPiecewiseLinearCompressedTest { + @Test + public void testComputeBreakpoints_uniformColumn() { + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(1e-3); + double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; // ← Test-spezifisch + List breaks = computeBreakpoints(cs, column); + assertEquals(Arrays.asList(0, 5), breaks); // Erwartet: keine Breaks + } + + @Test + public void testComputeBreakpoints_linearIncreasing() { + CompressionSettings cs = new CompressionSettingsBuilder().create(); + 
cs.setPiecewiseTargetLoss(1e-3); + double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; // ← andere column + List breaks = computeBreakpoints(cs, column); + assertEquals(Arrays.asList(0, 5), breaks); // Erwartet + + } + + @Test + public void testComputeBreakpoints_highLoss_uniform() { + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(10000.0); + double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; + List breaks = computeBreakpoints(cs, column); + assertEquals(Arrays.asList(0, 5), breaks); + } + + @Test + public void testComputeBreakpoints_twoSegments() { + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(1e-3); + // {1,1,1, 2,2,2} → 2 Segmente → [0,3,6] + double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0}; + var breaks = computeBreakpoints(cs, column); + assertEquals(Arrays.asList(0, 3, 6), breaks); + } + + @Test + public void testComputeBreakpoints_noLoss_linear() { + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(0.0); + //cs.setPiecewiseTargetLoss(0.0); + double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; + List breaks = computeBreakpoints(cs, column); + assertEquals(Arrays.asList(0, 5), breaks); // bei 0 Loss alle Breaks + } + + @Test + public void testComputeBreakpointsLambda_const() { + double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; + List breaks = computeBreakpointsLambda(column, 5.0); + assertEquals(Arrays.asList(0, 5), breaks); + + breaks = computeBreakpointsLambda(column, 0.01); + assertEquals(Arrays.asList(0, 5), breaks); + } + + @Test + public void testComputeBreakpointsLambda_twoSegments() { + double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0}; // 6 Werte + + // mit kleinem lambda -> viele Segmente (kostenlos fast) + List breaks = computeBreakpointsLambda(column, 0.01); + assertTrue(breaks.contains(3)); + assertEquals(3, breaks.size()); + assertEquals(Arrays.asList(0, 3, 6), breaks); + + // mit großem lambda entspricht nur ein Segment + 
breaks = computeBreakpointsLambda(column, 1000.0); + assertEquals(Arrays.asList(0, 6), breaks); + } + + @Test + public void testComputeBreakpointsLambda_jumpWithTrend() { + double[] column = {0.0, 1.0, 2.0, 10.0, 11.0, 12.0}; + + // grobe Segmentanpassung: ein Segment pro „Abschnitt“ + List breaks = computeBreakpointsLambda(column, 0.5); + assertEquals(Arrays.asList(0, 3, 6), breaks); + + // nur ein Segment, wenn lambda sehr groß + breaks = computeBreakpointsLambda(column, 100.0); + assertEquals(Arrays.asList(0, 6), breaks); + } + + @Test + public void testComputeBreakpointsLambda_linear() { + double[] column = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0}; + + List breaks = computeBreakpointsLambda(column, 1.0); + assertEquals(Arrays.asList(0, 6), breaks); + + // mit sehr kleinem lambda: wir prüfen nur, dass die Grenzen vernünftig sind + breaks = computeBreakpointsLambda(column, 0.001); + assertTrue(breaks.size() >= 2); + assertTrue(breaks.get(0) == 0); + assertTrue(breaks.get(breaks.size() - 1) == column.length); + } + + @Test + public void testComputeBreakpointsLambda_edge_lambdaVerySmall() { + double[] column = {1.0, 1.1, 1.0, 1.1, 1.0}; + + List breaks = computeBreakpointsLambda(column, 0.001); + assertNotNull(breaks); + assertFalse(breaks.isEmpty()); + assertEquals(0, (int) breaks.get(0)); + assertEquals(column.length, (int) breaks.get(breaks.size() - 1)); + + // Prüfe, dass die Liste sortiert ist + for(int i = 1; i < breaks.size(); i++) { + assertTrue(breaks.get(i) >= breaks.get(i - 1)); + } + } + + @Test + public void testComputeBreakpointsLambda_edge_lambdaVeryLarge() { + double[] column = {1.0, 2.0, 1.5, 2.5, 1.8}; + + List breaks = computeBreakpointsLambda(column, 1000.0); + assertEquals(Arrays.asList(0, 5), breaks); + } + + @Test + public void testComputeSegmentCost_emptyOrSingle() { + double[] column = {10.0, 20.0, 30.0}; + + // 0 Elemente (leer) + assertEquals(0.0, computeSegmentCost(column, 0, 0), 1e-10); + assertEquals(0.0, computeSegmentCost(column, 1, 1), 
1e-10); + + // 1 Element → Regressionsgerade ist nicht eindeutig definiert, aber SSE=0 + assertEquals(0.0, computeSegmentCost(column, 0, 1), 1e-10); + assertEquals(0.0, computeSegmentCost(column, 1, 2), 1e-10); + assertEquals(0.0, computeSegmentCost(column, 2, 3), 1e-10); + } + + @Test + public void testComputeSegmentCost_twoConstantPoints() { + double[] column = {5.0, 5.0, 1.0, 1.0}; + + // Zwei identische Punkte (konstant) → SSE = 0 + double sse = computeSegmentCost(column, 0, 2); + assertEquals(0.0, sse, 1e-10); + } + + @Test + public void testComputeSegmentCost_twoDifferentPoints() { + double[] column = {0.0, 2.0, 1.0, 3.0}; + + // Zwei Punkte: (0,0) und (1,2) → Gerade y = 2*x, Fehler = 0 + double sse = computeSegmentCost(column, 0, 2); + assertEquals(0.0, sse, 1e-10); + + // Zwei Punkte: (2,1) und (3,3) → Gerade y = 2*x - 3, Fehler = 0 + sse = computeSegmentCost(column, 2, 4); + assertEquals(0.0, sse, 1e-10); + } + + @Test + public void testComputeSegmentCost_constantThree() { + double[] column = {0.0, 0.0, 0.0}; + double sse = computeSegmentCost(column, 0, 3); + assertEquals(0.0, sse, 1e-10); + } + + @Test + public void testComputeSegmentCost_consistent_with_regression() { + double[] column = {0.0, 2.0, 0.0, 4.0, 0.0, 6.0}; + + int start = 0, end = 3; + double[] ab = regressSegment(column, start, end); + double slope = ab[0], intercept = ab[1]; + double sse_hand = 0.0; + for(int i = start; i < end; i++) { + double yhat = slope * i + intercept; + double diff = column[i] - yhat; + sse_hand += diff * diff; + } + + double sse = computeSegmentCost(column, start, end); + assertEquals(sse_hand, sse, 1e-10); + } + + @Test + public void testComputeTotalSSE_emptyBreaks() { + double[] column = {1.0, 2.0, 3.0}; + List breaks = Arrays.asList(); // leer → keine Segmente + double total = computeTotalSSE(column, breaks); + + // 0 Segmente → Summe über 0 Segmente = 0 + assertEquals(0.0, total, 1e-10); + } + + @Test + public void testComputeTotalSSE_singleSegment_all() { + 
double[] column = {1.0, 2.0, 3.0}; + List breaks = Arrays.asList(0, 3); // ein Segment [0,3) + + double total = computeTotalSSE(column, breaks); + double expected = computeSegmentCost(column, 0, 3); + + // Ergebnis muss exakt das gleiche wie der SSE des gesamten Segments sein + assertEquals(expected, total, 1e-10); + } + + @Test + public void testComputeTotalSSE_twoSegments() { + // Beispiel: [0,0,0] und [1,1,1] (jeweils konstant) + double[] column = {0.0, 0.0, 0.0, 1.0, 1.0, 1.0}; + List breaks = Arrays.asList(0, 3, 6); // zwei Segmente + + double total = computeTotalSSE(column, breaks); + double sse1 = computeSegmentCost(column, 0, 3); // [0,0,0] → SSE = 0 + double sse2 = computeSegmentCost(column, 3, 6); // [1,1,1] → SSE = 0 + + // da beide Segmente konstant sind, muss totalSSE = 0 sein + assertEquals(0.0, total, 1e-10); + assertEquals(sse1 + sse2, total, 1e-10); + } + + @Test + public void testComputeTotalSSE_threeSegments() { + // Ein Segment mit drei identischen Werten, zwei Segmente mit jeweils zwei Werten + double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0}; + List breaks = Arrays.asList(0, 3, 5, 7); + + // Segment [0,3): konstant 1.0 → SSE = 0 + double sse1 = computeSegmentCost(column, 0, 3); // 0 + + // Segment [3,5): [2,2] → SSE = 0 + double sse2 = computeSegmentCost(column, 3, 5); // 0 + + // Segment [5,7): [3,3] → SSE = 0 + double sse3 = computeSegmentCost(column, 5, 7); // 0 + + double total = computeTotalSSE(column, breaks); + assertEquals(0.0, total, 1e-10); + assertEquals(sse1 + sse2 + sse3, total, 1e-10); + } + + @Test + public void testComputeTotalSSE_gapStartEnd() { + double[] column = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0}; + List breaks = Arrays.asList(2, 5, 8); + + double total = computeTotalSSE(column, breaks); + double sse1 = computeSegmentCost(column, 2, 5); + double sse2 = computeSegmentCost(column, 5, 8); + + assertEquals(sse1 + sse2, total, 1e-10); + + } + + @Test + public void testComputeTotalSSE_oneSegment_identical() 
{ + double[] column = {1.0, 2.0, 3.0, 4.0, 5.0}; + double sseTotal = computeSegmentCost(column, 0, 5); + + List breaks = Arrays.asList(0, 5); + double total = computeTotalSSE(column, breaks); + + assertEquals(sseTotal, total, 1e-10); + } + + @Test + public void testComputeTotalSSE_nonConstant() { + double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; + List breaks = Arrays.asList(0, 2, 5); + + double total = computeTotalSSE(column, breaks); + double sse1 = computeSegmentCost(column, 0, 2); + double sse2 = computeSegmentCost(column, 2, 5); + + assertTrue(total >= 0.0); + assertEquals(sse1 + sse2, total, 1e-10); + } + + @Test + public void testComputeTotalSSE_edgeCases() { + double[] columnEmpty = {}; + List breaksEmpty = Arrays.asList(0, 0); + assertEquals(0.0, computeTotalSSE(columnEmpty, breaksEmpty), 1e-10); + + double[] columnOne = {42.0}; + List breaksOne = Arrays.asList(0, 1); + double total = computeTotalSSE(columnOne, breaksOne); + assertEquals(0.0, total, 1e-10); + } + + @Test + public void testRegressSegment_empty() { + double[] column = {1.0, 2.0, 3.0}; + double[] result = regressSegment(column, 0, 0); + assertEquals(0.0, result[0], 1e-10); + assertEquals(0.0, result[1], 1e-10); + } + + @Test + public void testRegressSegment_singlePoint() { + double[] column = {1.0, 2.0, 3.0}; + double[] result = regressSegment(column, 1, 2); + + assertEquals(0.0, result[0], 1e-10); + assertEquals(2.0, result[1], 1e-10); + } + + @Test + public void testRegressSegment_twoIdentical() { + double[] column = {5.0, 5.0, 1.0, 1.0}; + double[] result = regressSegment(column, 0, 2); + + assertEquals(0.0, result[0], 1e-10); + assertEquals(5.0, result[1], 1e-10); + } + + @Test + public void testRegressSegment_twoPoints() { + double[] column = {0.0, 2.0}; + double[] result = regressSegment(column, 0, 2); + + assertEquals(2.0, result[0], 1e-10); + assertEquals(0.0, result[1], 1e-10); + } + + @Test + public void testRegressSegment_twoPoints_offset() { + + double[] column = {1.0, 3.0, 5.0, 
7.0}; + double[] result = regressSegment(column, 2, 4); + + assertEquals(2.0, result[0], 1e-10); + assertEquals(1.0, result[1], 1e-10); + } + + @Test + public void testRegressSegment_constant() { + double[] column = {3.0, 3.0, 3.0, 3.0}; + double[] result = regressSegment(column, 0, 4); + + assertEquals(0.0, result[0], 1e-10); + assertEquals(3.0, result[1], 1e-10); + } + + @Test + public void testRegressSegment_linear() { + double[] column = new double[4]; + double a = 1.5, b = 2.0; + for(int i = 0; i < 4; i++) { + column[i] = a * i + b; + } + + double[] result = regressSegment(column, 0, 4); + + assertEquals(a, result[0], 1e-10); + assertEquals(b, result[1], 1e-10); + } + + @Test + public void testRegressSegment_denomZero() { + double[] column = {10.0}; + double[] result = regressSegment(column, 0, 1); + + assertEquals(0.0, result[0], 1e-10); + assertEquals(10.0, result[1], 1e-10); + } + + @Test + public void testCompressPiecewiseLinearFunctional_const() { + // 1. MatrixBlock mit einer konstanten Spalte erzeugen + int nrows = 20, ncols = 1; + MatrixBlock in = new MatrixBlock(nrows, ncols, false); + for(int r = 0; r < nrows; r++) + in.set(r, 0, 1.0); + // 2. colIndexes für Spalte 0 + IColIndex colIndexes = ColIndexFactory.create(new int[] {0}); + // 3. CompressionSettings mit TargetLoss + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(1e-6); + // 4. Aufruf der Kompressionsfunktion + AColGroup result = ColGroupFactory.compressPiecewiseLinearFunctional(colIndexes, in, cs); + + // 5. Ergebnis ist eine ColGroupPiecewiseLinearCompressed? + assertTrue(result instanceof ColGroupPiecewiseLinearCompressed); + ColGroupPiecewiseLinearCompressed plGroup = (ColGroupPiecewiseLinearCompressed) result; + + // 6. Breakpoints per Getter, nicht per create() + int[] breakpoints = plGroup.getBreakpoints(); + assertArrayEquals(new int[] {0, 20}, breakpoints); + + // 7. 
Pro Segment: 1 Segment → ein slope, ein intercept + double[] slopes = plGroup.getSlopes(); + double[] intercepts = plGroup.getIntercepts(); + assertEquals(1, slopes.length); + assertEquals(1, intercepts.length); + + // 8. Für konstante Daten: Steigung ~0, intercept ~1.0 + assertEquals(0.0, slopes[0], 1e-10); + assertEquals(1.0, intercepts[0], 1e-10); + + // 9. Check: colIndexes stimmt + IColIndex idx = plGroup.getColIndices(); + assertEquals(1, idx.size()); + assertEquals(0, idx.get(0)); + } + + @Test(expected = IllegalArgumentException.class) + public void testCreate_nullBreakpoints() { + int[] nullBp = null; + ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), nullBp, new double[] {1.0}, + new double[] {0.0}, 10); + } + + @Test(expected = IllegalArgumentException.class) + public void testCreate_tooFewBreakpoints() { + int[] singleBp = {0}; + ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), singleBp, new double[] {1.0}, + new double[] {0.0}, 10); + } + + @Test(expected = IllegalArgumentException.class) + public void testCreate_inconsistentSlopes() { + int[] bp = {0, 5, 10}; + ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), bp, + new double[] {1.0, 2.0, 3.0}, new double[] {0.0, 1.0}, 10); + } + + @Test(expected = IllegalArgumentException.class) + public void testCreate_inconsistentIntercepts() { + int[] bp = {0, 5, 10}; + ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), bp, new double[] {1.0, 2.0}, + new double[] {0.0}, 10); + } + + @Test + public void testCreate_validMultiSegment() { + int[] bp = {0, 3, 7, 10}; + double[] slopes = {1.0, -2.0, 0.5}; + double[] intercepts = {0.0, 5.0, -1.0}; + IColIndex cols = ColIndexFactory.create(new int[] {0, 1}); + + AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, 10); + + assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); + assertNotSame(bp, 
((ColGroupPiecewiseLinearCompressed) cg).getBreakpoints()); + } + + @Test + public void testCreate_multiColumn() { + IColIndex cols = ColIndexFactory.create(new int[] {5, 10, 15}); + int[] bp = {0, 5}; + double[] slopes = {3.0}; + double[] intercepts = {2.0}; + + AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, 100); + assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); + assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); + + // + assertTrue(cg.getNumValues() > 0); + + for(int r = 0; r < 5; r++) { + double expected = 3.0 * r + 2.0; + // colIdx=0 → globale Spalte 5 + assertEquals(expected, cg.getIdx(r, 0), 1e-9); + // colIdx=1 → globale Spalte 10 + assertEquals(expected, cg.getIdx(r, 1), 1e-9); + // colIdx=2 → globale Spalte 15 + assertEquals(expected, cg.getIdx(r, 2), 1e-9); + } + + for(int r = 5; r < 10; r++) { + double expected = 3.0 * r + 2.0; + assertEquals(expected, cg.getIdx(r, 0), 1e-9); // Alle Columns gleich + } + assertEquals(cols.size(), 3); + } + + @Test + public void testCreate_singleColumn() { + IColIndex cols = ColIndexFactory.create(new int[] {5}); + int[] bp = {0, 5}; + double[] slopes = {3.0}; + double[] intercepts = {2.0}; + int numRows = 10; + + AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, numRows); + + assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); + + assertEquals(2.0, cg.getIdx(0, 0), 1e-9); // 3*0 + 2 + assertEquals(5.0, cg.getIdx(1, 0), 1e-9); // 3*1 + 2 + } + + @Test + public void testCreate_validMinimal() { + + // 1 Segment: [0,10] → y = 2.0 * r + 1.0 + int[] bp = {0, 10}; + double[] slopes = {2.0}; + double[] intercepts = {1.0}; + IColIndex cols = ColIndexFactory.create(new int[] {0}); + int numRows = 10; + + AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, numRows); + + // Korrekte Instanz + assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); + + // getNumValues() > 0 + 
assertTrue(cg.getNumValues() > 0); + + // r < numRows + for(int r = 0; r < numRows; r++) { + double expected = 2.0 * r + 1.0; + assertEquals("Row " + r, expected, cg.getIdx(r, 0), 1e-9); + } + + // Letzte gültige Row + assertEquals(19.0, cg.getIdx(9, 0), 1e-9); + + //Out-of-Bounds korrekt 0.0 + assertEquals(0.0, cg.getIdx(10, 0), 1e-9); + assertEquals(0.0, cg.getIdx(9, 1), 1e-9); + } + + @Test + public void testDecompressToDenseBlock() { + int[] bp = {0, 5, 10}; + double[] slopes = {1.0, 2.0}; + double[] intercepts = {0.0, 1.0}; + int numRows = 10; + + AColGroup cg = ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), bp, slopes, + intercepts, numRows); + + // 1. MatrixBlock mit korrekten Dimensionen + MatrixBlock target = new MatrixBlock(numRows, 1, false); + + // 2. DenseBlock ZUERST alloziieren! + target.allocateDenseBlock(); // Oder target.allocateDenseBlock(true); + + // 3. Jetzt DenseBlock verfügbar + DenseBlock db = target.getDenseBlock(); + assertNotNull(db); // Sicherstellen! + + // 4. Dekomprimieren + cg.decompressToDenseBlock(db, 0, numRows, 0, 0); + + // 5. Prüfen + for(int r = 0; r < numRows; r++) { + double expected = (r < 5) ? 
1.0 * r : 2.0 * r + 1.0; + assertEquals("Row " + r, expected, db.get(r, 0), 1e-9); + } + } + + private ColGroupPiecewiseLinearCompressed createTestGroup(int numRows) { + int[] bp = {0, 5, numRows}; + double[] slopes = {1.0, 3.0}; + double[] intercepts = {0.0, 2.0}; + return (ColGroupPiecewiseLinearCompressed) ColGroupPiecewiseLinearCompressed.create( + ColIndexFactory.create(new int[] {0}), bp, slopes, intercepts, numRows); + } + + @Test + public void testDecompressToDenseBlock_fullRange() { + ColGroupPiecewiseLinearCompressed cg = createTestGroup(12); + + MatrixBlock target = new MatrixBlock(12, 1, false); + target.allocateDenseBlock(); + DenseBlock db = target.getDenseBlock(); + + cg.decompressToDenseBlock(db, 0, 12, 0, 0); + + // Segment 0 [0,5): y = r + assertEquals(0.0, db.get(0, 0), 1e-9); + assertEquals(4.0, db.get(4, 0), 1e-9); - @Test - public void testComputeBreakpoints_uniformColumn() { - CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(1e-3); - double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; // ← Test-spezifisch - List breaks = computeBreakpoints(cs, column); - assertEquals(Arrays.asList(0, 5), breaks); // Erwartet: keine Breaks - } - - @Test - public void testComputeBreakpoints_linearIncreasing() { - CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(1e-3); - double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; // ← andere column - List breaks = computeBreakpoints(cs, column); - assertEquals(Arrays.asList(0, 5), breaks); // Erwartet - - } - - @Test - public void testComputeBreakpoints_highLoss_uniform() { - CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(10000.0); - double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; - List breaks = computeBreakpoints(cs, column); - assertEquals(Arrays.asList(0, 5), breaks); - } - - @Test - public void testComputeBreakpoints_twoSegments() { - CompressionSettings cs = new 
CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(1e-3); - // {1,1,1, 2,2,2} → 2 Segmente → [0,3,6] - double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0}; - var breaks = computeBreakpoints(cs, column); - assertEquals(Arrays.asList(0, 3, 6), breaks); - } - - @Test - public void testComputeBreakpoints_noLoss_linear() { - CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(0.0); - //cs.setPiecewiseTargetLoss(0.0); - double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; - List breaks = computeBreakpoints(cs, column); - assertEquals(Arrays.asList(0, 5), breaks); // bei 0 Loss alle Breaks - } - - @Test - public void testComputeBreakpointsLambda_const() { - double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; - List breaks = computeBreakpointsLambda(column, 5.0); - assertEquals(Arrays.asList(0, 5), breaks); - - breaks = computeBreakpointsLambda(column, 0.01); - assertEquals(Arrays.asList(0, 5), breaks); - } - - @Test - public void testComputeBreakpointsLambda_twoSegments() { - double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0}; // 6 Werte - - // mit kleinem lambda -> viele Segmente (kostenlos fast) - List breaks = computeBreakpointsLambda(column, 0.01); - assertTrue(breaks.contains(3)); - assertEquals(3, breaks.size()); - assertEquals(Arrays.asList(0, 3, 6), breaks); - - // mit großem lambda entspricht nur ein Segment - breaks = computeBreakpointsLambda(column, 1000.0); - assertEquals(Arrays.asList(0, 6), breaks); - } - - @Test - public void testComputeBreakpointsLambda_jumpWithTrend() { - double[] column = {0.0, 1.0, 2.0, 10.0, 11.0, 12.0}; - - // grobe Segmentanpassung: ein Segment pro „Abschnitt“ - List breaks = computeBreakpointsLambda(column, 0.5); - assertEquals(Arrays.asList(0, 3, 6), breaks); - - // nur ein Segment, wenn lambda sehr groß - breaks = computeBreakpointsLambda(column, 100.0); - assertEquals(Arrays.asList(0, 6), breaks); - } - - @Test - public void testComputeBreakpointsLambda_linear() { - double[] column = {0.0, 
1.0, 2.0, 3.0, 4.0, 5.0}; - - List breaks = computeBreakpointsLambda(column, 1.0); - assertEquals(Arrays.asList(0, 6), breaks); - - // mit sehr kleinem lambda: wir prüfen nur, dass die Grenzen vernünftig sind - breaks = computeBreakpointsLambda(column, 0.001); - assertTrue(breaks.size() >= 2); - assertTrue(breaks.get(0) == 0); - assertTrue(breaks.get(breaks.size() - 1) == column.length); - } - - @Test - public void testComputeBreakpointsLambda_edge_lambdaVerySmall() { - double[] column = {1.0, 1.1, 1.0, 1.1, 1.0}; - - List breaks = computeBreakpointsLambda(column, 0.001); - assertNotNull(breaks); - assertFalse(breaks.isEmpty()); - assertEquals(0, (int) breaks.get(0)); - assertEquals(column.length, (int) breaks.get(breaks.size() - 1)); - - // Prüfe, dass die Liste sortiert ist - for (int i = 1; i < breaks.size(); i++) { - assertTrue(breaks.get(i) >= breaks.get(i - 1)); - } - } - - @Test - public void testComputeBreakpointsLambda_edge_lambdaVeryLarge() { - double[] column = {1.0, 2.0, 1.5, 2.5, 1.8}; - - List breaks = computeBreakpointsLambda(column, 1000.0); - assertEquals(Arrays.asList(0, 5), breaks); - } - - @Test - public void testComputeSegmentCost_emptyOrSingle() { - double[] column = {10.0, 20.0, 30.0}; - - // 0 Elemente (leer) - assertEquals(0.0, computeSegmentCost(column, 0, 0), 1e-10); - assertEquals(0.0, computeSegmentCost(column, 1, 1), 1e-10); - - // 1 Element → Regressionsgerade ist nicht eindeutig definiert, aber SSE=0 - assertEquals(0.0, computeSegmentCost(column, 0, 1), 1e-10); - assertEquals(0.0, computeSegmentCost(column, 1, 2), 1e-10); - assertEquals(0.0, computeSegmentCost(column, 2, 3), 1e-10); - } - - @Test - public void testComputeSegmentCost_twoConstantPoints() { - double[] column = {5.0, 5.0, 1.0, 1.0}; - - // Zwei identische Punkte (konstant) → SSE = 0 - double sse = computeSegmentCost(column, 0, 2); - assertEquals(0.0, sse, 1e-10); - } - - @Test - public void testComputeSegmentCost_twoDifferentPoints() { - double[] column = {0.0, 2.0, 1.0, 
3.0}; - - // Zwei Punkte: (0,0) und (1,2) → Gerade y = 2*x, Fehler = 0 - double sse = computeSegmentCost(column, 0, 2); - assertEquals(0.0, sse, 1e-10); - - // Zwei Punkte: (2,1) und (3,3) → Gerade y = 2*x - 3, Fehler = 0 - sse = computeSegmentCost(column, 2, 4); - assertEquals(0.0, sse, 1e-10); - } - - @Test - public void testComputeSegmentCost_constantThree() { - double[] column = {0.0, 0.0, 0.0}; - double sse = computeSegmentCost(column, 0, 3); - assertEquals(0.0, sse, 1e-10); - } - - @Test - public void testComputeSegmentCost_consistent_with_regression() { - double[] column = {0.0, 2.0, 0.0, 4.0, 0.0, 6.0}; - - int start = 0, end = 3; - double[] ab = regressSegment(column, start, end); - double slope = ab[0], intercept = ab[1]; - double sse_hand = 0.0; - for (int i = start; i < end; i++) { - double yhat = slope * i + intercept; - double diff = column[i] - yhat; - sse_hand += diff * diff; - } - - double sse = computeSegmentCost(column, start, end); - assertEquals(sse_hand, sse, 1e-10); - } - - @Test - public void testComputeTotalSSE_emptyBreaks() { - double[] column = {1.0, 2.0, 3.0}; - List breaks = Arrays.asList(); // leer → keine Segmente - double total = computeTotalSSE(column, breaks); - - // 0 Segmente → Summe über 0 Segmente = 0 - assertEquals(0.0, total, 1e-10); - } - - @Test - public void testComputeTotalSSE_singleSegment_all() { - double[] column = {1.0, 2.0, 3.0}; - List breaks = Arrays.asList(0, 3); // ein Segment [0,3) - - double total = computeTotalSSE(column, breaks); - double expected = computeSegmentCost(column, 0, 3); - - // Ergebnis muss exakt das gleiche wie der SSE des gesamten Segments sein - assertEquals(expected, total, 1e-10); - } - - @Test - public void testComputeTotalSSE_twoSegments() { - // Beispiel: [0,0,0] und [1,1,1] (jeweils konstant) - double[] column = {0.0, 0.0, 0.0, 1.0, 1.0, 1.0}; - List breaks = Arrays.asList(0, 3, 6); // zwei Segmente - - double total = computeTotalSSE(column, breaks); - double sse1 = 
computeSegmentCost(column, 0, 3); // [0,0,0] → SSE = 0 - double sse2 = computeSegmentCost(column, 3, 6); // [1,1,1] → SSE = 0 - - // da beide Segmente konstant sind, muss totalSSE = 0 sein - assertEquals(0.0, total, 1e-10); - assertEquals(sse1 + sse2, total, 1e-10); - } - - @Test - public void testComputeTotalSSE_threeSegments() { - // Ein Segment mit drei identischen Werten, zwei Segmente mit jeweils zwei Werten - double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0}; - List breaks = Arrays.asList(0, 3, 5, 7); - - // Segment [0,3): konstant 1.0 → SSE = 0 - double sse1 = computeSegmentCost(column, 0, 3); // 0 - - // Segment [3,5): [2,2] → SSE = 0 - double sse2 = computeSegmentCost(column, 3, 5); // 0 - - // Segment [5,7): [3,3] → SSE = 0 - double sse3 = computeSegmentCost(column, 5, 7); // 0 - - double total = computeTotalSSE(column, breaks); - assertEquals(0.0, total, 1e-10); - assertEquals(sse1 + sse2 + sse3, total, 1e-10); - } - - @Test - public void testComputeTotalSSE_gapStartEnd() { - double[] column = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0}; - List breaks = Arrays.asList(2, 5, 8); - - double total = computeTotalSSE(column, breaks); - double sse1 = computeSegmentCost(column, 2, 5); - double sse2 = computeSegmentCost(column, 5, 8); - - assertEquals(sse1 + sse2, total, 1e-10); - - } - - @Test - public void testComputeTotalSSE_oneSegment_identical() { - double[] column = {1.0, 2.0, 3.0, 4.0, 5.0}; - double sseTotal = computeSegmentCost(column, 0, 5); - - List breaks = Arrays.asList(0, 5); - double total = computeTotalSSE(column, breaks); - - assertEquals(sseTotal, total, 1e-10); - } - - @Test - public void testComputeTotalSSE_nonConstant() { - double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; - List breaks = Arrays.asList(0, 2, 5); - - double total = computeTotalSSE(column, breaks); - double sse1 = computeSegmentCost(column, 0, 2); - double sse2 = computeSegmentCost(column, 2, 5); - - assertTrue(total >= 0.0); - assertEquals(sse1 + sse2, total, 1e-10); - 
} - - @Test - public void testComputeTotalSSE_edgeCases() { - double[] columnEmpty = {}; - List breaksEmpty = Arrays.asList(0, 0); - assertEquals(0.0, computeTotalSSE(columnEmpty, breaksEmpty), 1e-10); - - double[] columnOne = {42.0}; - List breaksOne = Arrays.asList(0, 1); - double total = computeTotalSSE(columnOne, breaksOne); - assertEquals(0.0, total, 1e-10); - } - - @Test - public void testRegressSegment_empty() { - double[] column = {1.0, 2.0, 3.0}; - double[] result = regressSegment(column, 0, 0); - assertEquals(0.0, result[0], 1e-10); - assertEquals(0.0, result[1], 1e-10); - } - - @Test - public void testRegressSegment_singlePoint() { - double[] column = {1.0, 2.0, 3.0}; - double[] result = regressSegment(column, 1, 2); - - assertEquals(0.0, result[0], 1e-10); - assertEquals(2.0, result[1], 1e-10); - } - - @Test - public void testRegressSegment_twoIdentical() { - double[] column = {5.0, 5.0, 1.0, 1.0}; - double[] result = regressSegment(column, 0, 2); - - assertEquals(0.0, result[0], 1e-10); - assertEquals(5.0, result[1], 1e-10); - } - - @Test - public void testRegressSegment_twoPoints() { - double[] column = {0.0, 2.0}; - double[] result = regressSegment(column, 0, 2); - - assertEquals(2.0, result[0], 1e-10); - assertEquals(0.0, result[1], 1e-10); - } - - @Test - public void testRegressSegment_twoPoints_offset() { - - double[] column = {1.0, 3.0, 5.0, 7.0}; - double[] result = regressSegment(column, 2, 4); - - assertEquals(2.0, result[0], 1e-10); - assertEquals(1.0, result[1], 1e-10); - } - - @Test - public void testRegressSegment_constant() { - double[] column = {3.0, 3.0, 3.0, 3.0}; - double[] result = regressSegment(column, 0, 4); - - assertEquals(0.0, result[0], 1e-10); - assertEquals(3.0, result[1], 1e-10); - } - - @Test - public void testRegressSegment_linear() { - double[] column = new double[4]; - double a = 1.5, b = 2.0; - for (int i = 0; i < 4; i++) { - column[i] = a * i + b; - } - - double[] result = regressSegment(column, 0, 4); - - 
assertEquals(a, result[0], 1e-10); - assertEquals(b, result[1], 1e-10); - } - - @Test - public void testRegressSegment_denomZero() { - double[] column = {10.0}; - double[] result = regressSegment(column, 0, 1); - - assertEquals(0.0, result[0], 1e-10); - assertEquals(10.0, result[1], 1e-10); - } - - @Test - public void testCompressPiecewiseLinearFunctional_const() { - // 1. MatrixBlock mit einer konstanten Spalte erzeugen - int nrows = 20, ncols = 1; - MatrixBlock in = new MatrixBlock(nrows, ncols, false); - for (int r = 0; r < nrows; r++) - in.set(r, 0, 1.0); - // 2. colIndexes für Spalte 0 - IColIndex colIndexes = ColIndexFactory.create(new int[]{0}); - // 3. CompressionSettings mit TargetLoss - CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(1e-6); - // 4. Aufruf der Kompressionsfunktion - AColGroup result = ColGroupFactory.compressPiecewiseLinearFunctional(colIndexes, in, cs); - - // 5. Ergebnis ist eine ColGroupPiecewiseLinearCompressed? - assertTrue(result instanceof ColGroupPiecewiseLinearCompressed); - ColGroupPiecewiseLinearCompressed plGroup = (ColGroupPiecewiseLinearCompressed) result; - - // 6. Breakpoints per Getter, nicht per create() - int[] breakpoints = plGroup.getBreakpoints(); - assertArrayEquals(new int[]{0, 20}, breakpoints); - - // 7. Pro Segment: 1 Segment → ein slope, ein intercept - double[] slopes = plGroup.getSlopes(); - double[] intercepts = plGroup.getIntercepts(); - assertEquals(1, slopes.length); - assertEquals(1, intercepts.length); - - // 8. Für konstante Daten: Steigung ~0, intercept ~1.0 - assertEquals(0.0, slopes[0], 1e-10); - assertEquals(1.0, intercepts[0], 1e-10); - - // 9. 
Check: colIndexes stimmt - IColIndex idx = plGroup.getColIndices(); - assertEquals(1, idx.size()); - assertEquals(0, idx.get(0)); - } - - @Test(expected = IllegalArgumentException.class) - public void testCreate_nullBreakpoints() { - int[] nullBp = null; - ColGroupPiecewiseLinearCompressed.create( - ColIndexFactory.create(new int[]{0}), nullBp, new double[]{1.0}, new double[]{0.0}, 10); - } - - @Test(expected = IllegalArgumentException.class) - public void testCreate_tooFewBreakpoints() { - int[] singleBp = {0}; - ColGroupPiecewiseLinearCompressed.create( - ColIndexFactory.create(new int[]{0}), singleBp, new double[]{1.0}, new double[]{0.0}, 10); - } - - @Test(expected = IllegalArgumentException.class) - public void testCreate_inconsistentSlopes() { - int[] bp = {0, 5, 10}; - ColGroupPiecewiseLinearCompressed.create( - ColIndexFactory.create(new int[]{0}), bp, new double[]{1.0, 2.0, 3.0}, - new double[]{0.0, 1.0}, 10); - } - - @Test(expected = IllegalArgumentException.class) - public void testCreate_inconsistentIntercepts() { - int[] bp = {0, 5, 10}; - ColGroupPiecewiseLinearCompressed.create( - ColIndexFactory.create(new int[]{0}), bp, new double[]{1.0, 2.0}, - new double[]{0.0}, 10); - } - - @Test - public void testCreate_validMultiSegment() { - int[] bp = {0, 3, 7, 10}; - double[] slopes = {1.0, -2.0, 0.5}; - double[] intercepts = {0.0, 5.0, -1.0}; - IColIndex cols = ColIndexFactory.create(new int[]{0, 1}); - - AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, 10); - - assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); - assertNotSame(bp, ((ColGroupPiecewiseLinearCompressed) cg).getBreakpoints()); - } - - @Test - public void testCreate_multiColumn() { - IColIndex cols = ColIndexFactory.create(new int[]{5, 10, 15}); - int[] bp = {0, 5}; - double[] slopes = {3.0}; - double[] intercepts = {2.0}; - - AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, 100); - assertTrue(cg instanceof 
ColGroupPiecewiseLinearCompressed); - assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); - - // - assertTrue(cg.getNumValues() > 0); - - for (int r = 0; r < 5; r++) { - double expected = 3.0 * r + 2.0; - // colIdx=0 → globale Spalte 5 - assertEquals(expected, cg.getIdx(r, 0), 1e-9); - // colIdx=1 → globale Spalte 10 - assertEquals(expected, cg.getIdx(r, 1), 1e-9); - // colIdx=2 → globale Spalte 15 - assertEquals(expected, cg.getIdx(r, 2), 1e-9); - } - - for (int r = 5; r < 10; r++) { - double expected = 3.0 * r + 2.0; - assertEquals(expected, cg.getIdx(r, 0), 1e-9); // Alle Columns gleich - } - assertEquals(cols.size(), 3); - } - - @Test - public void testCreate_singleColumn() { - IColIndex cols = ColIndexFactory.create(new int[]{5}); - int[] bp = {0, 5}; - double[] slopes = {3.0}; - double[] intercepts = {2.0}; - int numRows = 10; - - AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, numRows); - - assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); - - assertEquals(2.0, cg.getIdx(0, 0), 1e-9); // 3*0 + 2 - assertEquals(5.0, cg.getIdx(1, 0), 1e-9); // 3*1 + 2 - } - - @Test - public void testCreate_validMinimal() { - - // 1 Segment: [0,10] → y = 2.0 * r + 1.0 - int[] bp = {0, 10}; - double[] slopes = {2.0}; - double[] intercepts = {1.0}; - IColIndex cols = ColIndexFactory.create(new int[]{0}); - int numRows = 10; - - AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, numRows); - - // Korrekte Instanz - assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); - - // getNumValues() > 0 - assertTrue(cg.getNumValues() > 0); - - // r < numRows - for (int r = 0; r < numRows; r++) { - double expected = 2.0 * r + 1.0; - assertEquals("Row " + r, expected, cg.getIdx(r, 0), 1e-9); - } - - // Letzte gültige Row - assertEquals(19.0, cg.getIdx(9, 0), 1e-9); - - //Out-of-Bounds korrekt 0.0 - assertEquals(0.0, cg.getIdx(10, 0), 1e-9); - assertEquals(0.0, cg.getIdx(9, 1), 1e-9); - } - - 
@Test - public void testDecompressToDenseBlock() { - int[] bp = {0, 5, 10}; - double[] slopes = {1.0, 2.0}; - double[] intercepts = {0.0, 1.0}; - int numRows = 10; - - AColGroup cg = ColGroupPiecewiseLinearCompressed.create( - ColIndexFactory.create(new int[]{0}), bp, slopes, intercepts, numRows); - - // 1. MatrixBlock mit korrekten Dimensionen - MatrixBlock target = new MatrixBlock(numRows, 1, false); - - // 2. DenseBlock ZUERST alloziieren! - target.allocateDenseBlock(); // Oder target.allocateDenseBlock(true); - - // 3. Jetzt DenseBlock verfügbar - DenseBlock db = target.getDenseBlock(); - assertNotNull(db); // Sicherstellen! - - // 4. Dekomprimieren - cg.decompressToDenseBlock(db, 0, numRows, 0, 0); - - // 5. Prüfen - for (int r = 0; r < numRows; r++) { - double expected = (r < 5) ? 1.0 * r : 2.0 * r + 1.0; - assertEquals("Row " + r, expected, db.get(r, 0), 1e-9); - } - } - - private ColGroupPiecewiseLinearCompressed createTestGroup(int numRows) { - int[] bp = {0, 5, numRows}; - double[] slopes = {1.0, 3.0}; - double[] intercepts = {0.0, 2.0}; - return (ColGroupPiecewiseLinearCompressed) ColGroupPiecewiseLinearCompressed.create( - ColIndexFactory.create(new int[]{0}), bp, slopes, intercepts, numRows); - } - - @Test - public void testDecompressToDenseBlock_fullRange() { - ColGroupPiecewiseLinearCompressed cg = createTestGroup(12); - - MatrixBlock target = new MatrixBlock(12, 1, false); - target.allocateDenseBlock(); - DenseBlock db = target.getDenseBlock(); - - cg.decompressToDenseBlock(db, 0, 12, 0, 0); - - // Segment 0 [0,5): y = r - assertEquals(0.0, db.get(0, 0), 1e-9); - assertEquals(4.0, db.get(4, 0), 1e-9); - - assertEquals(17.0, db.get(5, 0), 1e-9); - assertEquals(29.0, db.get(9, 0), 1e-9); - assertEquals(32.0, db.get(10, 0), 1e-9); - assertEquals(35.0, db.get(11, 0), 1e-9); - } - - - - @Test - public void testDecompressToDenseBlock_partialRange() { - ColGroupPiecewiseLinearCompressed cg = createTestGroup(12); - - MatrixBlock target = new MatrixBlock(12, 
1, false); - target.allocateDenseBlock(); - DenseBlock db = target.getDenseBlock(); - - // rl=6, ru=9 → r=6,7,8 dekomprimieren - // offR=0 → schreibt in Target-Rows 6,7,8 - cg.decompressToDenseBlock(db, 6, 9, 0, 0); - - - assertEquals(0.0, db.get(0, 0), 1e-9); // Unberührt (vor rl=6) - assertEquals(20.0, db.get(6, 0), 1e-9); - assertEquals(23.0, db.get(7, 0), 1e-9); - assertEquals(26.0, db.get(8, 0), 1e-9); - assertEquals(0.0, db.get(9, 0), 1e-9); // Unberührt (nach ru=9) - } - - - @Test - public void testDecompressToDenseBlock_emptyRange() { - ColGroupPiecewiseLinearCompressed cg = createTestGroup(12); - - MatrixBlock target = new MatrixBlock(5, 1, false); - target.allocateDenseBlock(); - DenseBlock db = target.getDenseBlock(); - - // Leerer Bereich - cg.decompressToDenseBlock(db, 12, 12, 0, 0); // rl=ru - cg.decompressToDenseBlock(db, 3, 2, 0, 0); // rl>ru - - // Alles bleibt 0.0 - for (int r = 0; r < 5; r++) { - assertEquals(0.0, db.get(r, 0), 1e-9); - } - } - - @Test - public void testDecompressToDenseBlock_nullSafety() { - ColGroupPiecewiseLinearCompressed cg = createTestGroup(10); + assertEquals(17.0, db.get(5, 0), 1e-9); + assertEquals(29.0, db.get(9, 0), 1e-9); + assertEquals(32.0, db.get(10, 0), 1e-9); + assertEquals(35.0, db.get(11, 0), 1e-9); + } - // Null DenseBlock - cg.decompressToDenseBlock(null, 0, 10, 0, 0); + @Test + public void testDecompressToDenseBlock_partialRange() { + ColGroupPiecewiseLinearCompressed cg = createTestGroup(12); - // Ungültige Parameter (leerer Bereich) - MatrixBlock target = new MatrixBlock(10, 1, false); - target.allocateDenseBlock(); - DenseBlock db = target.getDenseBlock(); + MatrixBlock target = new MatrixBlock(12, 1, false); + target.allocateDenseBlock(); + DenseBlock db = target.getDenseBlock(); + + // rl=6, ru=9 → r=6,7,8 dekomprimieren + // offR=0 → schreibt in Target-Rows 6,7,8 + cg.decompressToDenseBlock(db, 6, 9, 0, 0); + + assertEquals(0.0, db.get(0, 0), 1e-9); // Unberührt (vor rl=6) + assertEquals(20.0, 
db.get(6, 0), 1e-9); + assertEquals(23.0, db.get(7, 0), 1e-9); + assertEquals(26.0, db.get(8, 0), 1e-9); + assertEquals(0.0, db.get(9, 0), 1e-9); // Unberührt (nach ru=9) + } - cg.decompressToDenseBlock(db, 12, 12, 0, 0); // rl == ru - cg.decompressToDenseBlock(db, 5, 2, 0, 0); // rl > ru + @Test + public void testDecompressToDenseBlock_emptyRange() { + ColGroupPiecewiseLinearCompressed cg = createTestGroup(12); + + MatrixBlock target = new MatrixBlock(5, 1, false); + target.allocateDenseBlock(); + DenseBlock db = target.getDenseBlock(); + + // Leerer Bereich + cg.decompressToDenseBlock(db, 12, 12, 0, 0); // rl=ru + cg.decompressToDenseBlock(db, 3, 2, 0, 0); // rl>ru + + // Alles bleibt 0.0 + for(int r = 0; r < 5; r++) { + assertEquals(0.0, db.get(r, 0), 1e-9); + } + } + + @Test + public void testDecompressToDenseBlock_nullSafety() { + ColGroupPiecewiseLinearCompressed cg = createTestGroup(10); + + // Null DenseBlock + cg.decompressToDenseBlock(null, 0, 10, 0, 0); + + // Ungültige Parameter (leerer Bereich) + MatrixBlock target = new MatrixBlock(10, 1, false); + target.allocateDenseBlock(); + DenseBlock db = target.getDenseBlock(); - // Target unverändert - for (int r = 0; r < 10; r++) { - assertEquals(0.0, db.get(r, 0), 1e-9); - } - } - private CompressedSizeInfo createTestCompressedSizeInfo() { - IColIndex cols = ColIndexFactory.create(new int[]{0}); - EstimationFactors facts = new EstimationFactors(2, 10); + cg.decompressToDenseBlock(db, 12, 12, 0, 0); // rl == ru + cg.decompressToDenseBlock(db, 5, 2, 0, 0); // rl > ru - CompressedSizeInfoColGroup info = new CompressedSizeInfoColGroup( - cols, facts, AColGroup.CompressionType.PiecewiseLinear); + // Target unverändert + for(int r = 0; r < 10; r++) { + assertEquals(0.0, db.get(r, 0), 1e-9); + } + } - List infos = Arrays.asList(info); - CompressedSizeInfo csi = new CompressedSizeInfo(infos); + private CompressedSizeInfo createTestCompressedSizeInfo() { + IColIndex cols = ColIndexFactory.create(new int[] {0}); + 
EstimationFactors facts = new EstimationFactors(2, 10); - return csi; - } + CompressedSizeInfoColGroup info = new CompressedSizeInfoColGroup(cols, facts, + AColGroup.CompressionType.PiecewiseLinear); + + List infos = Arrays.asList(info); + CompressedSizeInfo csi = new CompressedSizeInfo(infos); + + return csi; + } - @Test - public void testCompressPiecewiseLinear_viaRealAPI() { + @Test + public void testCompressPiecewiseLinear_viaRealAPI() { - MatrixBlock in = new MatrixBlock(10, 1, false); - in.allocateDenseBlock(); - for (int r = 0; r < 10; r++) { - in.set(r, 0, r * 0.5); - } + MatrixBlock in = new MatrixBlock(10, 1, false); + in.allocateDenseBlock(); + for(int r = 0; r < 10; r++) { + in.set(r, 0, r * 0.5); + } - CompressionSettings cs = new CompressionSettingsBuilder() - .addValidCompression(AColGroup.CompressionType.PiecewiseLinear) - .create(); + CompressionSettings cs = new CompressionSettingsBuilder().addValidCompression( + AColGroup.CompressionType.PiecewiseLinear).create(); - CompressedSizeInfo csi = createTestCompressedSizeInfo(); + CompressedSizeInfo csi = createTestCompressedSizeInfo(); - List colGroups = ColGroupFactory.compressColGroups(in, csi, cs); + List colGroups = ColGroupFactory.compressColGroups(in, csi, cs); - boolean hasPiecewise = colGroups.stream() - .anyMatch(cg -> cg instanceof ColGroupPiecewiseLinearCompressed); - assertTrue(hasPiecewise); - } + boolean hasPiecewise = colGroups.stream().anyMatch(cg -> cg instanceof ColGroupPiecewiseLinearCompressed); + assertTrue(hasPiecewise); + } -} \ No newline at end of file +} From 0faa2f830b34c881bb2ee573764dcd6ba1202522 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Thu, 5 Feb 2026 16:23:38 +0100 Subject: [PATCH 13/35] wip: fix formattaing fix: fix efficiency getIdx() --- .../ColGroupPiecewiseLinearCompressed.java | 214 ++++++++++-------- 1 file changed, 118 insertions(+), 96 deletions(-) rename 
src/main/java/org/apache/sysds/runtime/compress/colgroup/{scheme => }/ColGroupPiecewiseLinearCompressed.java (63%) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressed.java similarity index 63% rename from src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java rename to src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressed.java index 1f39dc44cb0..35891eb8c53 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressed.java @@ -1,9 +1,8 @@ -package org.apache.sysds.runtime.compress.colgroup.scheme; +package org.apache.sysds.runtime.compress.colgroup; -import org.apache.sysds.runtime.compress.colgroup.AColGroup; -import org.apache.sysds.runtime.compress.colgroup.AColGroupCompressed; -import org.apache.sysds.runtime.compress.colgroup.ColGroupUtils; +import org.apache.commons.lang3.NotImplementedException; import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex; +import org.apache.sysds.runtime.compress.colgroup.scheme.ICLAScheme; import org.apache.sysds.runtime.compress.cost.ComputationCostEstimator; import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup; import org.apache.sysds.runtime.data.DenseBlock; @@ -34,6 +33,7 @@ protected ColGroupPiecewiseLinearCompressed(IColIndex colIndices) { public ColGroupPiecewiseLinearCompressed(IColIndex colIndexes, int[] breakpoints, double[] slopes, double[] intercepts, int numRows) { super(colIndexes); + this.colIndexes = colIndexes; this.breakpoints = breakpoints; this.slopes = slopes; this.intercepts = intercepts; @@ -60,333 +60,355 @@ public static AColGroup create(IColIndex colIndexes, int[] breakpoints, 
double[] @Override public void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int offC) { - if(db == null || _colIndexes == null || _colIndexes.size() == 0 || breakpoints == null || slopes == null || + //Safety-Check: + if(db == null || colIndexes == null || colIndexes.size() == 0 || breakpoints == null || slopes == null || intercepts == null) { return; } - - int numSeg = breakpoints.length - 1; - if(numSeg <= 0 || rl >= ru) { + //Validate Segments + int sizeSegment = breakpoints.length - 1; + if(sizeSegment <= 0 || rl >= ru) { return; } - - final int col = _colIndexes.get(0); - - for(int s = 0; s < numSeg; s++) { - int segStart = breakpoints[s]; - int segEnd = breakpoints[s + 1]; + //Find every Segment + final int column = _colIndexes.get(0); + for(int currentSeg = 0; currentSeg < sizeSegment; currentSeg++) { + int segStart = breakpoints[currentSeg]; + int segEnd = breakpoints[currentSeg + 1]; if(segStart >= segEnd) - continue; // Invalid Segment + continue; - double a = slopes[s]; - double b = intercepts[s]; + double currentSlope = slopes[currentSeg]; + double currentIntercepts = intercepts[currentSeg]; - int rs = Math.max(segStart, rl); - int re = Math.min(segEnd, ru); - if(rs >= re) + int rowStart = Math.max(segStart, rl); + int rowEnd = Math.min(segEnd, ru); + if(rowStart >= rowEnd) continue; - for(int r = rs; r < re; r++) { - double yhat = a * r + b; - int gr = offR + r; - int gc = offC + col; + // Filling DenseBlock Matrix + for(int r = rowStart; r < rowEnd; r++) { + double yhat = currentSlope * r + currentIntercepts; + int dbRow = offR + r; + int dbColumn = offC + column; - if(gr >= 0 && gr < db.numRows() && gc >= 0 && gc < db.numCols()) { - db.set(gr, gc, yhat); + if(dbRow >= 0 && dbRow < db.numRows() && dbColumn >= 0 && dbColumn < db.numCols()) { + db.set(dbRow, dbColumn, yhat); } } } } + public int[] getBreakpoints() { + return breakpoints; + } + + public double[] getSlopes() { + return slopes; + } + + public double[] getIntercepts() { + 
return intercepts; + } + + @Override + public double getIdx(int r, int colIdx) { + //Check if the rowIDx is valid (safety check) + if(r < 0 || r >= numRows || colIdx < 0 || colIdx >= colIndexes.size()) { + return 0.0; + } + // Using Binary Search for efficient Search for the right Segment ( finding rowIdx r) + // have to use int higherBound = breakpoints.length - 2 because it's the last valid segment + int lowerBound = 0; + int higherBound = breakpoints.length - 2; + while(lowerBound <= higherBound) { + int mid = (lowerBound + higherBound) / 2; + if(r < breakpoints[mid] + 1) { + higherBound = mid - 1; + } + else + lowerBound = mid + 1; + } + int segment = Math.min(lowerBound, breakpoints.length - 2); + + return slopes[segment] * (double) r + intercepts[segment]; + } + + @Override + public int getNumValues() { + return breakpoints.length + slopes.length + intercepts.length; + } + @Override protected double computeMxx(double c, Builtin builtin) { - return 0; + throw new NotImplementedException(); } @Override protected void computeColMxx(double[] c, Builtin builtin) { - + throw new NotImplementedException(); } @Override protected void computeSum(double[] c, int nRows) { + throw new NotImplementedException(); } @Override protected void computeSumSq(double[] c, int nRows) { + throw new NotImplementedException(); } @Override protected void computeColSumsSq(double[] c, int nRows) { + throw new NotImplementedException(); } @Override protected void computeRowSums(double[] c, int rl, int ru, double[] preAgg) { + throw new NotImplementedException(); } @Override protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru, double[] preAgg) { + throw new NotImplementedException(); } @Override protected void computeProduct(double[] c, int nRows) { + throw new NotImplementedException(); } @Override protected void computeRowProduct(double[] c, int rl, int ru, double[] preAgg) { + throw new NotImplementedException(); } @Override protected void computeColProduct(double[] 
c, int nRows) { + throw new NotImplementedException(); } @Override protected double[] preAggSumRows() { - return new double[0]; + throw new NotImplementedException(); } @Override protected double[] preAggSumSqRows() { - return new double[0]; + throw new NotImplementedException(); } @Override protected double[] preAggProductRows() { - return new double[0]; + throw new NotImplementedException(); } @Override protected double[] preAggBuiltinRows(Builtin builtin) { - return new double[0]; + throw new NotImplementedException(); } @Override public boolean sameIndexStructure(AColGroupCompressed that) { - return false; + throw new NotImplementedException(); } @Override protected void tsmm(double[] result, int numColumns, int nRows) { + throw new NotImplementedException(); } @Override public AColGroup copyAndSet(IColIndex colIndexes) { - return null; + throw new NotImplementedException(); } @Override public void decompressToDenseBlockTransposed(DenseBlock db, int rl, int ru) { + throw new NotImplementedException(); } @Override public void decompressToSparseBlockTransposed(SparseBlockMCSR sb, int nColOut) { + throw new NotImplementedException(); } - @Override - public double getIdx(int r, int colIdx) { - // ✅ CRUCIAL: Bounds-Check für colIdx! - if(r < 0 || r >= numRows || colIdx < 0 || colIdx >= _colIndexes.size()) { - return 0.0; - } - - // Segment-Suche (sicher jetzt) - int seg = 0; - for(int i = 1; i < breakpoints.length; i++) { - if(r < breakpoints[i]) { - break; - } - seg = i - 1; // seg < numSeg immer! 
- } - - return slopes[seg] * (double) r + intercepts[seg]; - } - - @Override - public int getNumValues() { - return breakpoints.length + slopes.length + intercepts.length; - } - @Override public CompressionType getCompType() { - return null; + throw new NotImplementedException(); } @Override protected ColGroupType getColGroupType() { - return null; + throw new NotImplementedException(); } @Override public void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC) { + throw new NotImplementedException(); } @Override public AColGroup rightMultByMatrix(MatrixBlock right, IColIndex allCols, int k) { - return null; + throw new NotImplementedException(); } @Override public void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock result, int rl, int ru, int cl, int cu) { + throw new NotImplementedException(); } @Override public void leftMultByAColGroup(AColGroup lhs, MatrixBlock result, int nRows) { + throw new NotImplementedException(); } @Override public void tsmmAColGroup(AColGroup other, MatrixBlock result) { + throw new NotImplementedException(); } @Override public AColGroup scalarOperation(ScalarOperator op) { - return null; + throw new NotImplementedException(); } @Override public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) { - return null; + throw new NotImplementedException(); } @Override public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) { - return null; + throw new NotImplementedException(); } @Override protected AColGroup sliceSingleColumn(int idx) { - return null; + throw new NotImplementedException(); } @Override protected AColGroup sliceMultiColumns(int idStart, int idEnd, IColIndex outputCols) { - return null; + throw new NotImplementedException(); } @Override public AColGroup sliceRows(int rl, int ru) { - return null; + throw new NotImplementedException(); } @Override public boolean containsValue(double pattern) { - return false; + throw new 
NotImplementedException(); } @Override public long getNumberNonZeros(int nRows) { - return 0; + throw new NotImplementedException(); } @Override public AColGroup replace(double pattern, double replace) { - return null; + throw new NotImplementedException(); } @Override public void computeColSums(double[] c, int nRows) { + throw new NotImplementedException(); } @Override public CmCovObject centralMoment(CMOperator op, int nRows) { - return null; + throw new NotImplementedException(); } @Override public AColGroup rexpandCols(int max, boolean ignore, boolean cast, int nRows) { - return null; + throw new NotImplementedException(); } @Override public double getCost(ComputationCostEstimator e, int nRows) { - return 0; + throw new NotImplementedException(); } @Override public AColGroup unaryOperation(UnaryOperator op) { - return null; + throw new NotImplementedException(); } @Override public AColGroup append(AColGroup g) { - return null; + throw new NotImplementedException(); } @Override protected AColGroup appendNInternal(AColGroup[] groups, int blen, int rlen) { - return null; + throw new NotImplementedException(); } @Override public ICLAScheme getCompressionScheme() { - return null; + throw new NotImplementedException(); } @Override public AColGroup recompress() { - return null; + throw new NotImplementedException(); } @Override public CompressedSizeInfoColGroup getCompressionInfo(int nRow) { - return null; + throw new NotImplementedException(); } @Override protected AColGroup fixColIndexes(IColIndex newColIndex, int[] reordering) { - return null; + throw new NotImplementedException(); } @Override public AColGroup reduceCols() { - return null; + throw new NotImplementedException(); } @Override public double getSparsity() { - return 0; + throw new NotImplementedException(); } @Override protected void sparseSelection(MatrixBlock selection, ColGroupUtils.P[] points, MatrixBlock ret, int rl, int ru) { - + throw new NotImplementedException(); } @Override protected void 
denseSelection(MatrixBlock selection, ColGroupUtils.P[] points, MatrixBlock ret, int rl, int ru) { - + throw new NotImplementedException(); } @Override public AColGroup[] splitReshape(int multiplier, int nRow, int nColOrg) { - return new AColGroup[0]; - } - - public int[] getBreakpoints() { - return breakpoints; + throw new NotImplementedException(); } - public double[] getSlopes() { - return slopes; - } - - public double[] getIntercepts() { - return intercepts; - } } From 698a942eb27a142a35ebf8bfb780b9e7a6f55143 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Thu, 5 Feb 2026 16:24:41 +0100 Subject: [PATCH 14/35] fix: reverted file --- .../compress/CompressionSettingsBuilder.java | 606 +++++++++--------- 1 file changed, 303 insertions(+), 303 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java index 9af1b5aff2e..02c9f97498d 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java @@ -34,332 +34,332 @@ * Builder pattern for Compression Settings. See CompressionSettings for details on values. 
*/ public class CompressionSettingsBuilder { - private double samplingRatio; - private double samplePower = 0.65; - private boolean allowSharedDictionary = false; - private String transposeInput; - private int seed = -1; - private boolean lossy = false; - private EnumSet validCompressions; - private boolean sortValuesByLength = true; - private int maxColGroupCoCode = 10000; - private double coCodePercentage = 0.01; - private int minimumSampleSize = 3000; - private int maxSampleSize = 1000000; - private EstimationType estimationType = EstimationType.HassAndStokes; - private PartitionerType columnPartitioner; - private CostType costType; - private double minimumCompressionRatio = 1.0; - private boolean isInSparkInstruction = false; - private SORT_TYPE sdcSortType = SORT_TYPE.MATERIALIZE; - private double[] scaleFactors = null; - private boolean preferDeltaEncoding = false; + private double samplingRatio; + private double samplePower = 0.65; + private boolean allowSharedDictionary = false; + private String transposeInput; + private int seed = -1; + private boolean lossy = false; + private EnumSet validCompressions; + private boolean sortValuesByLength = true; + private int maxColGroupCoCode = 10000; + private double coCodePercentage = 0.01; + private int minimumSampleSize = 3000; + private int maxSampleSize = 1000000; + private EstimationType estimationType = EstimationType.HassAndStokes; + private PartitionerType columnPartitioner; + private CostType costType; + private double minimumCompressionRatio = 1.0; + private boolean isInSparkInstruction = false; + private SORT_TYPE sdcSortType = SORT_TYPE.MATERIALIZE; + private double[] scaleFactors = null; + private boolean preferDeltaEncoding = false; - public CompressionSettingsBuilder() { + public CompressionSettingsBuilder() { - DMLConfig conf = ConfigurationManager.getDMLConfig(); - this.lossy = conf.getBooleanValue(DMLConfig.COMPRESSED_LOSSY); - this.validCompressions = EnumSet.of(CompressionType.UNCOMPRESSED, 
CompressionType.CONST, CompressionType.EMPTY); - String[] validCompressionsString = conf.getTextValue(DMLConfig.COMPRESSED_VALID_COMPRESSIONS).split(","); - for(String comp : validCompressionsString) - validCompressions.add(CompressionType.valueOf(comp)); - samplingRatio = conf.getDoubleValue(DMLConfig.COMPRESSED_SAMPLING_RATIO); - columnPartitioner = PartitionerType.valueOf(conf.getTextValue(DMLConfig.COMPRESSED_COCODE)); - costType = CostType.valueOf(conf.getTextValue(DMLConfig.COMPRESSED_COST_MODEL)); - transposeInput = conf.getTextValue(DMLConfig.COMPRESSED_TRANSPOSE); - seed = DMLScript.SEED; + DMLConfig conf = ConfigurationManager.getDMLConfig(); + this.lossy = conf.getBooleanValue(DMLConfig.COMPRESSED_LOSSY); + this.validCompressions = EnumSet.of(CompressionType.UNCOMPRESSED, CompressionType.CONST, CompressionType.EMPTY); + String[] validCompressionsString = conf.getTextValue(DMLConfig.COMPRESSED_VALID_COMPRESSIONS).split(","); + for(String comp : validCompressionsString) + validCompressions.add(CompressionType.valueOf(comp)); + samplingRatio = conf.getDoubleValue(DMLConfig.COMPRESSED_SAMPLING_RATIO); + columnPartitioner = PartitionerType.valueOf(conf.getTextValue(DMLConfig.COMPRESSED_COCODE)); + costType = CostType.valueOf(conf.getTextValue(DMLConfig.COMPRESSED_COST_MODEL)); + transposeInput = conf.getTextValue(DMLConfig.COMPRESSED_TRANSPOSE); + seed = DMLScript.SEED; - } + } - /** - * Sets the scale factors for compression, enabling quantization-fused compression. - * - * @param scaleFactors An array of scale factors applied during compression. - * - If row-wise scaling is used, this should be an array where each value corresponds to a row. - * - If a single scalar is provided, it is applied uniformly to the entire matrix. - * @return The CompressionSettingsBuilder instance with the updated scale factors. 
- */ - public CompressionSettingsBuilder setScaleFactor(double[] scaleFactors) { - this.scaleFactors = scaleFactors; - return this; - } + /** + * Sets the scale factors for compression, enabling quantization-fused compression. + * + * @param scaleFactors An array of scale factors applied during compression. + * - If row-wise scaling is used, this should be an array where each value corresponds to a row. + * - If a single scalar is provided, it is applied uniformly to the entire matrix. + * @return The CompressionSettingsBuilder instance with the updated scale factors. + */ + public CompressionSettingsBuilder setScaleFactor(double[] scaleFactors) { + this.scaleFactors = scaleFactors; + return this; + } - /** - * Copy the settings from another CompressionSettings Builder, modifies this, not that. - * - * @param that The other CompressionSettingsBuilder to copy settings from. - * @return The modified CompressionSettings in the same object. - */ - public CompressionSettingsBuilder copySettings(CompressionSettings that) { - this.samplingRatio = that.samplingRatio; - this.allowSharedDictionary = that.allowSharedDictionary; - this.transposeInput = that.transposeInput; - this.seed = that.seed; - this.lossy = that.lossy; - this.validCompressions = EnumSet.copyOf(that.validCompressions); - this.sortValuesByLength = that.sortTuplesByFrequency; - this.columnPartitioner = that.columnPartitioner; - this.maxColGroupCoCode = that.maxColGroupCoCode; - this.coCodePercentage = that.coCodePercentage; - this.minimumSampleSize = that.minimumSampleSize; - this.preferDeltaEncoding = that.preferDeltaEncoding; - return this; - } + /** + * Copy the settings from another CompressionSettings Builder, modifies this, not that. + * + * @param that The other CompressionSettingsBuilder to copy settings from. + * @return The modified CompressionSettings in the same object. 
+ */ + public CompressionSettingsBuilder copySettings(CompressionSettings that) { + this.samplingRatio = that.samplingRatio; + this.allowSharedDictionary = that.allowSharedDictionary; + this.transposeInput = that.transposeInput; + this.seed = that.seed; + this.lossy = that.lossy; + this.validCompressions = EnumSet.copyOf(that.validCompressions); + this.sortValuesByLength = that.sortTuplesByFrequency; + this.columnPartitioner = that.columnPartitioner; + this.maxColGroupCoCode = that.maxColGroupCoCode; + this.coCodePercentage = that.coCodePercentage; + this.minimumSampleSize = that.minimumSampleSize; + this.preferDeltaEncoding = that.preferDeltaEncoding; + return this; + } - /** - * Set the Compression to use Lossy compression. - * - * @param lossy A boolean specifying if the compression should be lossy - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setLossy(boolean lossy) { - this.lossy = lossy; - return this; - } + /** + * Set the Compression to use Lossy compression. + * + * @param lossy A boolean specifying if the compression should be lossy + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setLossy(boolean lossy) { + this.lossy = lossy; + return this; + } - /** - * Set the sampling ratio in percent to sample the input matrix. Input value should be in range 0.0 - 1.0 - * - * @param samplingRatio The ratio to sample from the input - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setSamplingRatio(double samplingRatio) { - this.samplingRatio = samplingRatio; - return this; - } + /** + * Set the sampling ratio in percent to sample the input matrix. 
Input value should be in range 0.0 - 1.0 + * + * @param samplingRatio The ratio to sample from the input + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setSamplingRatio(double samplingRatio) { + this.samplingRatio = samplingRatio; + return this; + } - /** - * Set the sortValuesByLength flag. This sorts the dictionaries containing the data based on their occurences in the - * ColGroup. Improving cache efficiency especially for diverse column groups. - * - * @param sortValuesByLength A boolean specifying if the values should be sorted - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setSortValuesByLength(boolean sortValuesByLength) { - this.sortValuesByLength = sortValuesByLength; - return this; - } + /** + * Set the sortValuesByLength flag. This sorts the dictionaries containing the data based on their occurences in the + * ColGroup. Improving cache efficiency especially for diverse column groups. + * + * @param sortValuesByLength A boolean specifying if the values should be sorted + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setSortValuesByLength(boolean sortValuesByLength) { + this.sortValuesByLength = sortValuesByLength; + return this; + } - /** - * Allow the Dictionaries to be shared between different column groups. - * - * @param allowSharedDictionary A boolean specifying if the dictionary can be shared between column groups. - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setAllowSharedDictionary(boolean allowSharedDictionary) { - this.allowSharedDictionary = allowSharedDictionary; - return this; - } + /** + * Allow the Dictionaries to be shared between different column groups. + * + * @param allowSharedDictionary A boolean specifying if the dictionary can be shared between column groups. 
+ * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setAllowSharedDictionary(boolean allowSharedDictionary) { + this.allowSharedDictionary = allowSharedDictionary; + return this; + } - /** - * Specify if the input matrix should be transposed before compression. This improves cache efficiency while - * compression the input matrix - * - * @param transposeInput string specifying if the input should be transposed before compression, should be one of - * "auto", "true" or "false" - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setTransposeInput(String transposeInput) { - switch(transposeInput) { - case "auto": - case "true": - case "false": - this.transposeInput = transposeInput; - break; - default: - throw new DMLCompressionException("Invalid transpose technique"); - } - return this; - } + /** + * Specify if the input matrix should be transposed before compression. This improves cache efficiency while + * compression the input matrix + * + * @param transposeInput string specifying if the input should be transposed before compression, should be one of + * "auto", "true" or "false" + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setTransposeInput(String transposeInput) { + switch(transposeInput) { + case "auto": + case "true": + case "false": + this.transposeInput = transposeInput; + break; + default: + throw new DMLCompressionException("Invalid transpose technique"); + } + return this; + } - /** - * Set the seed for the compression operation. - * - * @param seed The seed used in sampling the matrix and general operations in the compression. - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setSeed(int seed) { - this.seed = seed; - return this; - } + /** + * Set the seed for the compression operation. + * + * @param seed The seed used in sampling the matrix and general operations in the compression. 
+ * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setSeed(int seed) { + this.seed = seed; + return this; + } - /** - * Set the valid compression strategies used for the compression. - * - * @param validCompressions An EnumSet of CompressionTypes to use in the compression - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setValidCompressions(EnumSet validCompressions) { - // should always contain Uncompressed as an option. - if(!validCompressions.contains(CompressionType.UNCOMPRESSED)) - validCompressions.add(CompressionType.UNCOMPRESSED); - if(!validCompressions.contains(CompressionType.CONST)) - validCompressions.add(CompressionType.CONST); - if(!validCompressions.contains(CompressionType.EMPTY)) - validCompressions.add(CompressionType.EMPTY); - this.validCompressions = validCompressions; - return this; - } + /** + * Set the valid compression strategies used for the compression. + * + * @param validCompressions An EnumSet of CompressionTypes to use in the compression + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setValidCompressions(EnumSet validCompressions) { + // should always contain Uncompressed as an option. + if(!validCompressions.contains(CompressionType.UNCOMPRESSED)) + validCompressions.add(CompressionType.UNCOMPRESSED); + if(!validCompressions.contains(CompressionType.CONST)) + validCompressions.add(CompressionType.CONST); + if(!validCompressions.contains(CompressionType.EMPTY)) + validCompressions.add(CompressionType.EMPTY); + this.validCompressions = validCompressions; + return this; + } - /** - * Add a single valid compression type to the EnumSet of valid compressions. - * - * @param cp The compression type to add to the valid ones. 
- * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder addValidCompression(CompressionType cp) { - this.validCompressions.add(cp); - return this; - } + /** + * Add a single valid compression type to the EnumSet of valid compressions. + * + * @param cp The compression type to add to the valid ones. + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder addValidCompression(CompressionType cp) { + this.validCompressions.add(cp); + return this; + } - /** - * Clear all the compression types allowed in the compression. This will only allow the Uncompressed ColGroup type. - * Since this is required for operation of the compression - * - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder clearValidCompression() { - this.validCompressions = EnumSet.of(CompressionType.UNCOMPRESSED, CompressionType.EMPTY, CompressionType.CONST); - return this; - } + /** + * Clear all the compression types allowed in the compression. This will only allow the Uncompressed ColGroup type. + * Since this is required for operation of the compression + * + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder clearValidCompression() { + this.validCompressions = EnumSet.of(CompressionType.UNCOMPRESSED, CompressionType.EMPTY, CompressionType.CONST); + return this; + } - /** - * Set the type of CoCoding Partitioner type to use for combining columns together. - * - * @param columnPartitioner The Strategy to select from PartitionerType - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setColumnPartitioner(PartitionerType columnPartitioner) { - this.columnPartitioner = columnPartitioner; - return this; - } + /** + * Set the type of CoCoding Partitioner type to use for combining columns together. 
+ * + * @param columnPartitioner The Strategy to select from PartitionerType + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setColumnPartitioner(PartitionerType columnPartitioner) { + this.columnPartitioner = columnPartitioner; + return this; + } - /** - * Set the maximum number of columns to CoCode together in the CoCoding strategy. Compression time increase with - * higher numbers. - * - * @param maxColGroupCoCode The max selected. - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setMaxColGroupCoCode(int maxColGroupCoCode) { - this.maxColGroupCoCode = maxColGroupCoCode; - return this; - } + /** + * Set the maximum number of columns to CoCode together in the CoCoding strategy. Compression time increase with + * higher numbers. + * + * @param maxColGroupCoCode The max selected. + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setMaxColGroupCoCode(int maxColGroupCoCode) { + this.maxColGroupCoCode = maxColGroupCoCode; + return this; + } - /** - * Set the coCode percentage, the effect is different based on the coCoding strategy, but the general effect is that - * higher values results in more coCoding while lower values result in less. - * - * Note that with high coCoding the compression ratio would possibly be lower. - * - * @param coCodePercentage The percentage to set. - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setCoCodePercentage(double coCodePercentage) { - this.coCodePercentage = coCodePercentage; - return this; - } + /** + * Set the coCode percentage, the effect is different based on the coCoding strategy, but the general effect is that + * higher values results in more coCoding while lower values result in less. + * + * Note that with high coCoding the compression ratio would possibly be lower. + * + * @param coCodePercentage The percentage to set. 
+ * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setCoCodePercentage(double coCodePercentage) { + this.coCodePercentage = coCodePercentage; + return this; + } - /** - * Set the minimum sample size to extract from a given matrix, this overrules the sample percentage if the sample - * percentage extracted is lower than this minimum bound. - * - * @param minimumSampleSize The minimum sample size to extract - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setMinimumSampleSize(int minimumSampleSize) { - this.minimumSampleSize = minimumSampleSize; - return this; - } + /** + * Set the minimum sample size to extract from a given matrix, this overrules the sample percentage if the sample + * percentage extracted is lower than this minimum bound. + * + * @param minimumSampleSize The minimum sample size to extract + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setMinimumSampleSize(int minimumSampleSize) { + this.minimumSampleSize = minimumSampleSize; + return this; + } - /** - * Set the maximum sample size to extract from a given matrix, this overrules the sample percentage if the sample - * percentage extracted is higher than this maximum bound. - * - * @param maxSampleSize The maximum sample size to extract - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setMaxSampleSize(int maxSampleSize) { - this.maxSampleSize = maxSampleSize; - return this; - } + /** + * Set the maximum sample size to extract from a given matrix, this overrules the sample percentage if the sample + * percentage extracted is higher than this maximum bound. + * + * @param maxSampleSize The maximum sample size to extract + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setMaxSampleSize(int maxSampleSize) { + this.maxSampleSize = maxSampleSize; + return this; + } - /** - * Set the estimation type used for the sampled estimates. 
- * - * @param estimationType the estimation type in used. - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setEstimationType(EstimationType estimationType) { - this.estimationType = estimationType; - return this; - } + /** + * Set the estimation type used for the sampled estimates. + * + * @param estimationType the estimation type in used. + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setEstimationType(EstimationType estimationType) { + this.estimationType = estimationType; + return this; + } - /** - * Set the cost type used for estimating the cost of column groups default is memory based. - * - * @param costType The Cost type wanted - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setCostType(CostType costType) { - this.costType = costType; - return this; - } + /** + * Set the cost type used for estimating the cost of column groups default is memory based. + * + * @param costType The Cost type wanted + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setCostType(CostType costType) { + this.costType = costType; + return this; + } - /** - * Set the minimum compression ratio to be achieved by the compression. - * - * @param ratio The ratio to achieve while compressing - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setMinimumCompressionRatio(double ratio) { - this.minimumCompressionRatio = ratio; - return this; - } + /** + * Set the minimum compression ratio to be achieved by the compression. + * + * @param ratio The ratio to achieve while compressing + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setMinimumCompressionRatio(double ratio) { + this.minimumCompressionRatio = ratio; + return this; + } - /** - * Inform the compression that it is executed in a spark instruction. 
- * - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setIsInSparkInstruction() { - this.isInSparkInstruction = true; - return this; - } + /** + * Inform the compression that it is executed in a spark instruction. + * + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setIsInSparkInstruction() { + this.isInSparkInstruction = true; + return this; + } - /** - * Set the sort type to use. - * - * @param sdcSortType The sort type for the construction of SDC groups - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setSDCSortType(SORT_TYPE sdcSortType) { - this.sdcSortType = sdcSortType; - return this; - } + /** + * Set the sort type to use. + * + * @param sdcSortType The sort type for the construction of SDC groups + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setSDCSortType(SORT_TYPE sdcSortType) { + this.sdcSortType = sdcSortType; + return this; + } - /** - * Set whether to prefer delta encoding during compression estimation. - * When enabled, the compression estimator will use delta encoding statistics - * instead of regular encoding statistics. - * - * @param preferDeltaEncoding Whether to prefer delta encoding - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setPreferDeltaEncoding(boolean preferDeltaEncoding) { - this.preferDeltaEncoding = preferDeltaEncoding; - return this; - } + /** + * Set whether to prefer delta encoding during compression estimation. + * When enabled, the compression estimator will use delta encoding statistics + * instead of regular encoding statistics. 
+ * + * @param preferDeltaEncoding Whether to prefer delta encoding + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setPreferDeltaEncoding(boolean preferDeltaEncoding) { + this.preferDeltaEncoding = preferDeltaEncoding; + return this; + } - /** - * Create the CompressionSettings object to use in the compression. - * - * @return The CompressionSettings - */ - public CompressionSettings create() { - return new CompressionSettings(samplingRatio, samplePower, allowSharedDictionary, transposeInput, seed, lossy, - validCompressions, sortValuesByLength, columnPartitioner, maxColGroupCoCode, coCodePercentage, - minimumSampleSize, maxSampleSize, estimationType, costType, minimumCompressionRatio, isInSparkInstruction, - sdcSortType, scaleFactors, preferDeltaEncoding); - } -} \ No newline at end of file + /** + * Create the CompressionSettings object to use in the compression. + * + * @return The CompressionSettings + */ + public CompressionSettings create() { + return new CompressionSettings(samplingRatio, samplePower, allowSharedDictionary, transposeInput, seed, lossy, + validCompressions, sortValuesByLength, columnPartitioner, maxColGroupCoCode, coCodePercentage, + minimumSampleSize, maxSampleSize, estimationType, costType, minimumCompressionRatio, isInSparkInstruction, + sdcSortType, scaleFactors, preferDeltaEncoding); + } +} From 898af6892e3276303b9f37cc45c40d289ccbbc37 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Thu, 5 Feb 2026 16:26:12 +0100 Subject: [PATCH 15/35] rm: comment reformatted and add targetloss handling --- .../apache/sysds/runtime/compress/CompressionSettings.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java index 7d5a1dac51a..99c4b9c2ecb 100644 --- 
a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java @@ -136,10 +136,8 @@ public class CompressionSettings { public final boolean preferDeltaEncoding; - /** - * Ziel-Gesantverlust für piecewise Lineace Komocession• Interpretation: maximal entaubter Alobaler MSE pro Went in - * der Sealte. O.O ~ quasi verlustfrei, viele Segmente >0 ~ mehr Approximation entaubt, weniger Segmente - */ + // Handling Targetloss for piecewise linear Kompression + private double piecewiseTargetLoss = Double.NaN; public void setPiecewiseTargetLoss(double piecewiseTargetLoss) { From d8ebc9fd50deb7910036e5eaf234f64c9d6f3f78 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Thu, 5 Feb 2026 16:27:16 +0100 Subject: [PATCH 16/35] fix: reverted file and add enum CompressionTypepiecewiseLinear --- .../runtime/compress/colgroup/AColGroup.java | 232 +++++++++--------- 1 file changed, 114 insertions(+), 118 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java index d761af7667a..e2bf69f5c15 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java @@ -55,7 +55,7 @@ /** * Abstract Class that is the lowest class type for the Compression framework. - * + * * AColGroup store information about a number of columns. * */ @@ -64,7 +64,6 @@ public abstract class AColGroup implements Serializable { private static final long serialVersionUID = -1318908671481L; /** Public super types of compression ColGroups supported */ - // Enum hinzugefügt -> Brauche ich aber auch das im ColGroupType Enum ergänzen? 
public static enum CompressionType { UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCFOR, DDCFOR, DeltaDDC, LinearFunctional, PiecewiseLinear; @@ -83,7 +82,7 @@ public boolean isSDC() { /** * Concrete ColGroupType - * + * * Protected such that outside the ColGroup package it should be unknown which specific subtype is used. */ protected static enum ColGroupType { @@ -96,7 +95,7 @@ protected static enum ColGroupType { /** * Main constructor. - * + * * @param colIndices offsets of the columns in the matrix block that make up the group */ protected AColGroup(IColIndex colIndices) { @@ -105,7 +104,7 @@ protected AColGroup(IColIndex colIndices) { /** * Obtain the offsets of the columns in the matrix block that make up the group - * + * * @return offsets of the columns in the matrix block that make up the group */ public final IColIndex getColIndices() { @@ -114,7 +113,7 @@ public final IColIndex getColIndices() { /** * Obtain the number of columns in this column group. - * + * * @return number of columns in this column group */ public final int getNumCols() { @@ -125,9 +124,9 @@ public final int getNumCols() { * Shift all column indexes contained by an offset. * * This is used for rbind to combine compressed matrices. - * + * * Since column indexes are reused between operations, we allocate a new list here to be safe - * + * * @param offset The offset to move all columns * @return A new column group object with the shifted columns */ @@ -139,7 +138,7 @@ public final AColGroup shiftColIndices(int offset) { * Copy the content of the column group with pointers to the previous content but with new column given Note this * method does not verify if the colIndexes specified are valid and correct dimensions for the underlying column * groups. - * + * * @param colIndexes the new indexes to use in the copy * @return a new object with pointers to underlying data. 
*/ @@ -147,7 +146,7 @@ public final AColGroup shiftColIndices(int offset) { /** * Get the upper bound estimate of in memory allocation for the column group. - * + * * @return an upper bound on the number of bytes used to store this ColGroup in memory. */ public long estimateInMemorySize() { @@ -158,9 +157,9 @@ public long estimateInMemorySize() { /** * Decompress a range of rows into a sparse block - * + * * Note that this is using append, so the sparse column indexes need to be sorted afterwards. - * + * * @param sb Sparse Target block * @param rl Row to start at * @param ru Row to end at @@ -171,7 +170,7 @@ public final void decompressToSparseBlock(SparseBlock sb, int rl, int ru) { /** * Decompress a range of rows into a dense block - * + * * @param db Dense target block * @param rl Row to start at * @param ru Row to end at @@ -182,7 +181,7 @@ public final void decompressToDenseBlock(DenseBlock db, int rl, int ru) { /** * Decompress a range of rows into a dense transposed block. - * + * * @param db Dense target block * @param rl Row in this column group to start at. * @param ru Row in this column group to end at. @@ -192,7 +191,7 @@ public final void decompressToDenseBlock(DenseBlock db, int rl, int ru) { /** * Decompress the column group to the sparse transposed block. Note that the column groups would only need to * decompress into specific sub rows of the Sparse block - * + * * @param sb Sparse target block * @param nColOut The number of columns in the sb. */ @@ -200,7 +199,7 @@ public final void decompressToDenseBlock(DenseBlock db, int rl, int ru) { /** * Serializes column group to data output. - * + * * @param out data output * @throws IOException if IOException occurs */ @@ -213,7 +212,7 @@ protected void write(DataOutput out) throws IOException { /** * Returns the exact serialized size of column group. This can be used for example for buffer preallocation. 
- * + * * @return exact serialized size for column group */ public long getExactSizeOnDisk() { @@ -226,11 +225,11 @@ public long getExactSizeOnDisk() { /** * Slice out the columns within the range of cl and cu to remove the dictionary values related to these columns. If * the ColGroup slicing from does not contain any columns within the range null is returned. - * + * * @param cl The lower bound of the columns to select * @param cu The upper bound of the columns to select (not inclusive). * @return A cloned Column Group, with a copied pointer to the old column groups index structure, but reduced - * dictionary and _columnIndexes correctly aligned with the expected sliced compressed matrix. + * dictionary and _columnIndexes correctly aligned with the expected sliced compressed matrix. */ public final AColGroup sliceColumns(int cl, int cu) { if(cl <= _colIndexes.get(0) && cu > _colIndexes.get(_colIndexes.size() - 1)) { @@ -248,10 +247,10 @@ else if(cu - cl == 1) /** * Slice out a single column from the column group. - * + * * @param col The column to slice, the column could potentially not be inside the column group * @return A new column group that is a single column, if the column requested is not in this column group null is - * returned. + * returned. */ public final AColGroup sliceColumn(int col) { int idx = _colIndexes.findIndex(col); @@ -263,11 +262,11 @@ public final AColGroup sliceColumn(int col) { /** * Slice out multiple columns within the interval between the given indexes. 
- * + * * @param cl The lower column index to slice from * @param cu The upper column index to slice to, (not included) * @return A column group of this containing the columns specified, returns null if the columns specified is not - * contained in the column group + * contained in the column group */ protected final AColGroup sliceMultiColumns(int cl, int cu) { SliceResult sr = _colIndexes.slice(cl, cu); @@ -279,7 +278,7 @@ protected final AColGroup sliceMultiColumns(int cl, int cu) { /** * Compute the column sum of the given list of groups - * + * * @param groups The Groups to sum * @param res The result to put the values into * @param nRows The number of rows in the groups @@ -293,9 +292,9 @@ public static double[] colSum(Collection groups, double[] res, int nR /** * Get the value at a global row/column position. - * + * * In general this performs since a binary search of colIndexes is performed for each lookup. - * + * * @param r row * @param c column * @return value at the row/column position @@ -310,7 +309,7 @@ public double get(int r, int c) { /** * Get the value at a colGroup specific row/column index position. - * + * * @param r row * @param colIdx column index in the _colIndexes. * @return value at the row/column index position @@ -319,16 +318,16 @@ public double get(int r, int c) { /** * Obtain number of distinct tuples in contained sets of values associated with this column group. - * + * * If the column group is uncompressed the number or rows is returned. - * + * * @return the number of distinct sets of values associated with the bitmaps in this column group */ public abstract int getNumValues(); /** * Obtain the compression type. - * + * * @return How the elements of the column group are compressed. */ public abstract CompressionType getCompType(); @@ -336,14 +335,14 @@ public double get(int r, int c) { /** * Internally get the specific type of ColGroup, this could be extracted from the object but that does not allow for * nice switches in the code. 
- * + * * @return ColGroupType of the object. */ protected abstract ColGroupType getColGroupType(); /** * Decompress into the DenseBlock. (no NNZ handling) - * + * * @param db Target DenseBlock * @param rl Row to start decompression from * @param ru Row to end decompression at (not inclusive) @@ -354,10 +353,10 @@ public double get(int r, int c) { /** * Decompress into the SparseBlock. (no NNZ handling) - * + * * Note this method is allowing to calls to append since it is assumed that the sparse column indexes are sorted * afterwards - * + * * @param sb Target SparseBlock * @param rl Row to start decompression from * @param ru Row to end decompression at (not inclusive) @@ -368,9 +367,9 @@ public double get(int r, int c) { /** * Right matrix multiplication with this column group. - * + * * This method can return null, meaning that the output overlapping group would have been empty. - * + * * @param right The MatrixBlock on the right of this matrix multiplication * @return The new Column Group or null that is the result of the matrix multiplication. */ @@ -380,9 +379,9 @@ public final AColGroup rightMultByMatrix(MatrixBlock right) { /** * Right matrix multiplication with this column group. - * + * * This method can return null, meaning that the output overlapping group would have been empty. - * + * * @param right The MatrixBlock on the right of this matrix multiplication * @param allCols A pre-materialized list of all col indexes, that can be shared across all column groups if use * full, can be set to null. @@ -393,7 +392,7 @@ public final AColGroup rightMultByMatrix(MatrixBlock right) { /** * Right side Matrix multiplication, iterating though this column group and adding to the ret - * + * * @param right Right side matrix to multiply with. 
* @param ret The return matrix to add results to * @param rl The row of this column group to multiply from @@ -402,20 +401,18 @@ public final AColGroup rightMultByMatrix(MatrixBlock right) { * @param cru The right hand side column upper * @param nRows The number of rows in this column group */ - public void rightDecompressingMult(MatrixBlock right, MatrixBlock ret, int rl, int ru, int nRows, int crl, - int cru) { - throw new NotImplementedException( - "not supporting right Decompressing Multiply on class: " + this.getClass().getSimpleName()); + public void rightDecompressingMult(MatrixBlock right, MatrixBlock ret, int rl, int ru, int nRows, int crl, int cru){ + throw new NotImplementedException("not supporting right Decompressing Multiply on class: " + this.getClass().getSimpleName()); } /** * Do a transposed self matrix multiplication on the left side t(x) %*% x. but only with this column group. - * + * * This gives better performance since there is no need to iterate through all the rows of the matrix, but the * execution can be limited to its number of distinct values. - * + * * Note it only calculate the upper triangle - * + * * @param ret The return matrix block [numColumns x numColumns] * @param nRows The number of rows in the column group */ @@ -423,7 +420,7 @@ public void rightDecompressingMult(MatrixBlock right, MatrixBlock ret, int rl, i /** * Left multiply with this column group. - * + * * @param matrix The matrix to multiply with on the left * @param result The result to output the values into, always dense for the purpose of the column groups * parallelizing @@ -437,7 +434,7 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Left side matrix multiplication with a column group that is transposed. - * + * * @param lhs The left hand side Column group to multiply with, the left hand side should be considered * transposed. Also it should be guaranteed that this column group is not empty. 
* @param result The result matrix to insert the result of the multiplication into @@ -447,16 +444,16 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Matrix multiply with this other column group, but: - * + * * 1. Only output upper triangle values. - * + * * 2. Multiply both ways with "this" being on the left and on the right. - * + * * It should be guaranteed that the input is not the same as the caller of the method. - * + * * The second step is achievable by treating the initial multiplied matrix, and adding its values to the correct * locations in the output. - * + * * @param other The other Column group to multiply with * @param result The result matrix to put the results into */ @@ -465,7 +462,7 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Perform the specified scalar operation directly on the compressed column group, without decompressing individual * cells if possible. - * + * * @param op operation to perform * @return version of this column group with the operation applied */ @@ -473,7 +470,7 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Perform a binary row operation. - * + * * @param op The operation to execute * @param v The vector of values to apply the values contained should be at least the length of the highest * value in the column index @@ -484,7 +481,7 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Short hand add operator call on column group to add a row vector to the column group - * + * * @param v The vector to add * @return A new column group where the vector is added. */ @@ -494,7 +491,7 @@ public AColGroup addVector(double[] v) { /** * Perform a binary row operation. 
- * + * * @param op The operation to execute * @param v The vector of values to apply the values contained should be at least the length of the highest * value in the column index @@ -506,9 +503,9 @@ public AColGroup addVector(double[] v) { /** * Unary Aggregate operator, since aggregate operators require new object output, the output becomes an uncompressed * matrix. - * + * * The range of rl to ru only applies to row aggregates. (ReduceCol) - * + * * @param op The operator used * @param c The output matrix block * @param nRows The total number of rows in the Column Group @@ -519,9 +516,9 @@ public AColGroup addVector(double[] v) { /** * Slice out column at specific index of this column group. - * + * * It is guaranteed that the column to slice is contained in this columnGroup. - * + * * @param idx The column index to slice out. * @return A new column group containing the columns inside. (never null) */ @@ -529,9 +526,9 @@ public AColGroup addVector(double[] v) { /** * Slice range of columns inside this column group. - * + * * It is guaranteed that the columns to slice is contained in this columnGroup. - * + * * @param idStart The column index to start at * @param idEnd The column index to end at (not included) * @param outputCols The output columns to extract materialized for ease of implementation @@ -541,10 +538,9 @@ public AColGroup addVector(double[] v) { /** * Slice range of rows out of the column group and return a new column group only containing the row segment. - * - * Note that this slice should maintain pointers back to the original dictionaries and only modify index - * structures. - * + * + * Note that this slice should maintain pointers back to the original dictionaries and only modify index structures. + * * @param rl The row to start at * @param ru The row to end at (not included) * @return A new column group containing the specified row range. 
@@ -553,21 +549,21 @@ public AColGroup addVector(double[] v) { /** * Short hand method for getting minimum value contained in this column group. - * + * * @return The minimum value contained in this ColumnGroup */ public abstract double getMin(); /** * Short hand method for getting maximum value contained in this column group. - * + * * @return The maximum value contained in this ColumnGroup */ public abstract double getMax(); /** * Short hand method for getting the sum of this column group - * + * * @param nRows The number of rows in the column group * @return The sum of this column group */ @@ -575,7 +571,7 @@ public AColGroup addVector(double[] v) { /** * Detect if the column group contains a specific value. - * + * * @param pattern The value to look for. * @return boolean saying true if the value is contained. */ @@ -583,7 +579,7 @@ public AColGroup addVector(double[] v) { /** * Get the number of nonZeros contained in this column group. - * + * * @param nRows The number of rows in the column group, this is used for groups that does not contain information * about how many rows they have. * @return The nnz. @@ -592,7 +588,7 @@ public AColGroup addVector(double[] v) { /** * Make a copy of the column group values, and replace all values that match pattern with replacement value. - * + * * @param pattern The value to look for * @param replace The value to replace the other value with * @return A new Column Group, reusing the index structure but with new values. @@ -601,7 +597,7 @@ public AColGroup addVector(double[] v) { /** * Compute the column sum - * + * * @param c The array to add the column sum to. * @param nRows The number of rows in the column group. */ @@ -609,7 +605,7 @@ public AColGroup addVector(double[] v) { /** * Central Moment instruction executed on a column group. - * + * * @param op The Operator to use. * @param nRows The number of rows contained in the ColumnGroup. * @return A Central Moment object. 
@@ -618,7 +614,7 @@ public AColGroup addVector(double[] v) { /** * Expand the column group to multiple columns. (one hot encode the column group) - * + * * @param max The number of columns to expand to and cutoff values at. * @param ignore If zero and negative values should be ignored. * @param cast If the double values contained should be cast to whole numbers. @@ -629,7 +625,7 @@ public AColGroup addVector(double[] v) { /** * Get the computation cost associated with this column group. - * + * * @param e The computation cost estimator * @param nRows the number of rows in the column group * @return The cost of this column group @@ -638,7 +634,7 @@ public AColGroup addVector(double[] v) { /** * Perform unary operation on the column group and return a new column group - * + * * @param op The operation to perform * @return The new column group */ @@ -646,19 +642,19 @@ public AColGroup addVector(double[] v) { /** * Get if the group is only containing zero - * + * * @return true if empty */ public abstract boolean isEmpty(); /** - * Append the other column group to this column group. This method tries to combine them to return a new column - * group containing both. In some cases it is possible in reasonable time, in others it is not. - * + * Append the other column group to this column group. This method tries to combine them to return a new column group + * containing both. In some cases it is possible in reasonable time, in others it is not. + * * The result is first this column group followed by the other column group in higher row values. - * + * * If it is not possible or very inefficient null is returned. - * + * * @param g The other column group * @return A combined column group or null */ @@ -666,9 +662,9 @@ public AColGroup addVector(double[] v) { /** * Append all column groups in the list provided together in one go allocating the output once. - * + * * If it is not possible or very inefficient null is returned. - * + * * @param groups The groups to combine. 
* @param blen The normal number of rows in the groups * @param rlen The total number of rows of all combined. @@ -680,11 +676,11 @@ public static AColGroup appendN(AColGroup[] groups, int blen, int rlen) { /** * Append all column groups in the list provided together with this. - * + * * A Important detail is the first entry in the group == this, and should not be appended twice. - * + * * If it is not possible or very inefficient null is returned. - * + * * @param groups The groups to combine. * @param blen The normal number of rows in the groups * @param rlen The total number of rows of all combined. @@ -694,7 +690,7 @@ public static AColGroup appendN(AColGroup[] groups, int blen, int rlen) { /** * Get the compression scheme for this column group to enable compression of other data. - * + * * @return The compression scheme of this column group */ public abstract ICLAScheme getCompressionScheme(); @@ -708,14 +704,14 @@ public void clear() { /** * Recompress this column group into a new column group. - * + * * @return A new or the same column group depending on optimization goal. */ public abstract AColGroup recompress(); /** * Recompress this column group into a new column group of the given type. - * + * * @param ct The compressionType that the column group should morph into * @param nRow The number of rows in this columngroup. * @return A new column group @@ -745,7 +741,7 @@ else if(ct == CompressionType.UNCOMPRESSED) { /** * Get the compression info for this column group. - * + * * @param nRow The number of rows in this column group. * @return The compression info for this group. */ @@ -753,7 +749,7 @@ else if(ct == CompressionType.UNCOMPRESSED) { /** * Combine this column group with another - * + * * @param other The other column group to combine with. * @param nRow The number of rows in both column groups. * @return A combined representation as a column group. 
@@ -764,7 +760,7 @@ public AColGroup combine(AColGroup other, int nRow) { /** * Get encoding of this column group. - * + * * @return The encoding of the index structure. */ public IEncode getEncoding() { @@ -785,19 +781,19 @@ public AColGroup sortColumnIndexes() { /** * Perform row sum on the internal dictionaries, and return the same index structure. - * + * * This method returns null on empty column groups. - * + * * Note this method does not guarantee correct behavior if the given group is AMorphingGroup, instead it should be * morphed to a valid columngroup via extractCommon first. - * + * * @return The reduced colgroup. */ public abstract AColGroup reduceCols(); /** * Selection (left matrix multiply) - * + * * @param selection A sparse matrix with "max" a single one in each row all other values are zero. * @param points The coordinates in the selection matrix to extract. * @param ret The MatrixBlock to decompress the selected rows into @@ -810,17 +806,17 @@ public final void selectionMultiply(MatrixBlock selection, P[] points, MatrixBlo else denseSelection(selection, points, ret, rl, ru); } - + /** * Get an approximate sparsity of this column group - * + * * @return the approximate sparsity of this columngroup */ public abstract double getSparsity(); /** * Sparse selection (left matrix multiply) - * + * * @param selection A sparse matrix with "max" a single one in each row all other values are zero. * @param points The coordinates in the selection matrix to extract. * @param ret The Sparse MatrixBlock to decompress the selected rows into @@ -831,7 +827,7 @@ public final void selectionMultiply(MatrixBlock selection, P[] points, MatrixBlo /** * Dense selection (left matrix multiply) - * + * * @param selection A sparse matrix with "max" a single one in each row all other values are zero. * @param points The coordinates in the selection matrix to extract. 
* @param ret The Dense MatrixBlock to decompress the selected rows into @@ -843,7 +839,7 @@ public final void selectionMultiply(MatrixBlock selection, P[] points, MatrixBlo /** * Method to determine if the columnGroup have the same index structure as another. Note that the column indexes and * dictionaries are allowed to be different. - * + * * @param that the other column group * @return if the index is the same. */ @@ -854,7 +850,7 @@ public boolean sameIndexStructure(AColGroup that) { /** * C bind the list of column groups with this column group. the list of elements provided in the index of each list * is guaranteed to have the same index structures - * + * * @param nRow The number of rows contained in all right and this column group. * @param nCol The number of columns to shift the right hand side column groups over when combining, this should * only effect the column indexes @@ -892,7 +888,7 @@ public AColGroup combineWithSameIndex(int nRow, int nCol, List right) /** * C bind the given column group to this. - * + * * @param nRow The number of rows contained in the right and this column group. * @param nCol The number of columns in this. * @param right The column group to c-bind. @@ -932,16 +928,16 @@ protected IColIndex combineColIndexes(final int nCol, List right) { /** * This method returns a list of column groups that are naive splits of this column group as if it is reshaped. - * + * * This means the column groups rows are split into x number of other column groups where x is the multiplier. - * + * * The indexes are assigned round robbin to each of the output groups, meaning the first index is assigned. - * + * * If for instance the 4. column group is split by a 2 multiplier and there was 5 columns in total originally. The * output becomes 2 column groups at column index 4 and one at 9. - * + * * If possible the split column groups should reuse pointers back to the original dictionaries! 
- * + * * @param multiplier The number of column groups to split into * @param nRow The number of rows in this column group in case the underlying column group does not know * @param nColOrg The number of overall columns in the host CompressedMatrixBlock. @@ -951,25 +947,25 @@ protected IColIndex combineColIndexes(final int nCol, List right) { /** * This method returns a list of column groups that are naive splits of this column group as if it is reshaped. - * + * * This means the column groups rows are split into x number of other column groups where x is the multiplier. - * + * * The indexes are assigned round robbin to each of the output groups, meaning the first index is assigned. - * + * * If for instance the 4. column group is split by a 2 multiplier and there was 5 columns in total originally. The * output becomes 2 column groups at column index 4 and one at 9. - * + * * If possible the split column groups should reuse pointers back to the original dictionaries! - * + * * This specific variation is pushing down the parallelization given via the executor service provided. 
If not * overwritten the default is to call the normal split reshape - * + * * @param multiplier The number of column groups to split into * @param nRow The number of rows in this column group in case the underlying column group does not know * @param nColOrg The number of overall columns in the host CompressedMatrixBlock * @param pool The executor service to submit parallel tasks to - * @return a list of split column groups * @throws Exception In case there is an error we throw the exception out instead of handling it + * @return a list of split column groups */ public AColGroup[] splitReshapePushDown(final int multiplier, final int nRow, final int nColOrg, final ExecutorService pool) throws Exception { From 36d318673889247d7abad169b15c5911d75be9c6 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Thu, 5 Feb 2026 16:27:32 +0100 Subject: [PATCH 17/35] fix: reverted file --- .../component/compress/colgroup/ColGroupFactoryTest.java | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupFactoryTest.java b/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupFactoryTest.java index a1a5c8a6794..c4da48a0232 100644 --- a/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupFactoryTest.java +++ b/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupFactoryTest.java @@ -19,10 +19,8 @@ package org.apache.sysds.test.component.compress.colgroup; -import static org.apache.sysds.runtime.compress.colgroup.ColGroupFactory.computeSegmentCost; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; -import static org.junit.jupiter.api.Assertions.assertTrue; import java.util.ArrayList; import java.util.Collection; @@ -53,7 +51,6 @@ @RunWith(value = Parameterized.class) public class ColGroupFactoryTest { - private final MatrixBlock mb; private final MatrixBlock mbt; private final ACostEstimate ce; 
@@ -330,7 +327,5 @@ public boolean isContiguous() { public int numBlocks() { return 2; } - - } } From a0d08d708d66c03fa4df769d073cdf4c3b4837bb Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Thu, 5 Feb 2026 22:51:18 +0100 Subject: [PATCH 18/35] fix: repeated compression on every column extract: methods for compressPiecewiseLinearCompression --- .../compress/colgroup/ColGroupFactory.java | 206 +++--------------- 1 file changed, 34 insertions(+), 172 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java index 67f2c492e09..b51111a4aba 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java @@ -43,6 +43,7 @@ import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory; import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary; import org.apache.sysds.runtime.compress.colgroup.functional.LinearRegression; +import org.apache.sysds.runtime.compress.colgroup.functional.PiecewiseLinearUtils; import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory; import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex; import org.apache.sysds.runtime.compress.colgroup.insertionsort.AInsertionSorter; @@ -51,7 +52,6 @@ import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory; import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory; -import org.apache.sysds.runtime.compress.colgroup.scheme.ColGroupPiecewiseLinearCompressed; import org.apache.sysds.runtime.compress.cost.ACostEstimate; import org.apache.sysds.runtime.compress.estim.CompressedSizeInfo; import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup; @@ -306,9 +306,7 @@ else if(ct 
== CompressionType.LinearFunctional) { } } else if(ct == CompressionType.PiecewiseLinear) { - return compressPiecewiseLinearFunctional(colIndexes, in, cs); - } else if(ct == CompressionType.DDCFOR) { AColGroup g = directCompressDDC(colIndexes, cg); @@ -1074,178 +1072,42 @@ private static AColGroup compressLinearFunctional(IColIndex colIndexes, MatrixBl return ColGroupLinearFunctional.create(colIndexes, coefficients, numRows); } - public static AColGroup compressPiecewiseLinearFunctional(IColIndex colIndexes, MatrixBlock in, - CompressionSettings cs) { - - //Erstmal den Inhalt einer Spalte speichern - - int numRows = in.getNumRows(); - int colIdx = colIndexes.get(0); //Die erste Spalte - double[] column = getColumn(in, colIdx); - - //Sette den Targetloss - - // Breakpoints bestimmen: Einteilung der Segmente - - List breakpointsList = computeBreakpoints(cs, column); - int[] breakpoints = breakpointsList.stream().mapToInt(Integer::intValue).toArray(); - //Für jedes Segment lineare Regression als kompressionsverfahren - - // 3) Pro Segment Regression -> a,b - int numSeg = breakpoints.length - 1; - double[] slopes = new double[numSeg]; - double[] intercepts = new double[numSeg]; - - for(int s = 0; s < numSeg; s++) { - int start = breakpoints[s]; - int end = breakpoints[s + 1]; - - double[] ab = regressSegment(column, start, end); // nutzt gleiche Stats wie computeSegmentCost - slopes[s] = ab[0]; - intercepts[s] = ab[1]; - } - //Erstelle die Datenstruktur: PiecewiseLinearColGroupCompressed - - return ColGroupPiecewiseLinearCompressed.create(colIndexes, breakpoints, slopes, intercepts, numRows); - } - - public static double[] getColumn(MatrixBlock in, int colIndex) { - int numRows = in.getNumRows(); // Anzahl der Zeilen [web:16] - double[] column = new double[numRows]; // Variable für die Spalte - - for(int r = 0; r < numRows; r++) { - column[r] = in.get(r, colIndex); // Wert (r, colIndex) lesen [web:16][web:25] - } - return column; - } - - public static List 
computeBreakpoints(CompressionSettings cs, double[] column) { - int n = column.length; - double targetMSE = cs.getPiecewiseTargetLoss(); - // Fall A: kein TargetLoss angegeben -> einfache Variante mit fixem λ - if(Double.isNaN(targetMSE) || targetMSE <= 0) { - double lambda = 5.0; - return computeBreakpointsLambda(column, lambda); - } - - // Fall B: TargetLoss gesetzt -> globales Fehlerbudget berücksichtigen - double sseMax = n * targetMSE; // MSE -> SSE-Budget - - double lambdaMin = 0.0; // viele Segmente, minimaler Fehler - double lambdaMax = 1e6; // wenige Segmente, mehr Fehler - - List bestBreaks = null; - - for(int it = 0; it < 20; it++) { // Binärsuche auf λ - double lambda = 0.5 * (lambdaMin + lambdaMax); - - List breaks = computeBreakpointsLambda(column, lambda); - double totalSSE = computeTotalSSE(column, breaks); - - if(totalSSE <= sseMax) { - // Budget eingehalten: wir können versuchen, mit größerem λ noch weniger Segmente zu nehmen - bestBreaks = breaks; - lambdaMin = lambda; - } - else { - // Fehler zu groß: λ verkleinern, mehr Segmente zulassen - lambdaMax = lambda; - } - } - - if(bestBreaks == null) - bestBreaks = computeBreakpointsLambda(column, lambdaMin); - - return bestBreaks; - } - - public static List computeBreakpointsLambda(double[] column, double lambda) { - int sizeColumn = column.length; - double[] dp = new double[sizeColumn + 1]; - int[] prev = new int[sizeColumn + 1]; - - dp[0] = 0.0; - - for(int index = 1; index <= sizeColumn; index++) { - dp[index] = Double.POSITIVE_INFINITY; - for(int i = 0; i < index; i++) { // Segment [i, index) - double costCurrentSegment = computeSegmentCost(column, i, index); // SSE - double candidateCost = dp[i] + costCurrentSegment + lambda; - if(candidateCost < dp[index]) { - dp[index] = candidateCost; - prev[index] = i; - } + public static AColGroup compressPiecewiseLinearFunctional( + IColIndex colIndexes, MatrixBlock in, CompressionSettings cs) { + + final int numRows = in.getNumRows(); + AColGroup result = 
null; + + //Compress every column + for (int col = 0; col < colIndexes.size(); col++) { + // get Column Index + IColIndex.SliceResult sliceResult = colIndexes.slice(col, col + 1); + IColIndex singleColIndex = sliceResult.ret; // ← .ret nötig! + + // Get Column from Matrix + final int colIdx = colIndexes.get(col); + double[] column = PiecewiseLinearUtils.getColumn(in, colIdx); + + //Compress column + PiecewiseLinearUtils.SegmentedRegression fit = + PiecewiseLinearUtils.compressSegmentedLeastSquares(column, cs); + + AColGroup singleGroup = ColGroupPiecewiseLinearCompressed.create( + singleColIndex, + fit.getBreakpoints(), + fit.getSlopes(), + fit.getIntercepts(), + numRows); + + // Combine multiple columns + if (result == null) { + result = singleGroup; + } else { + result = result.combineWithSameIndex(numRows, col, singleGroup); } } - List segmentLimits = new ArrayList<>(); - int breakpointIndex = sizeColumn; - while(breakpointIndex > 0) { - segmentLimits.add(breakpointIndex); - breakpointIndex = prev[breakpointIndex]; - } - segmentLimits.add(0); - Collections.sort(segmentLimits); - return segmentLimits; - } - - public static double computeSegmentCost(double[] column, int start, int end) { - int n = end - start; - if(n <= 1) - return 0.0; - - double[] ab = regressSegment(column, start, end); - double slope = ab[0]; - double intercept = ab[1]; - - double sse = 0.0; - for(int i = start; i < end; i++) { - double x = i; - double y = column[i]; - double yhat = slope * x + intercept; - double diff = y - yhat; - sse += diff * diff; - } - return sse; // oder sse / n als MSE - } - - public static double computeTotalSSE(double[] column, List breaks) { - double total = 0.0; - for(int s = 0; s < breaks.size() - 1; s++) { - int start = breaks.get(s); - int end = breaks.get(s + 1); - total += computeSegmentCost(column, start, end); // SSE des Segments - } - return total; - } - - public static double[] regressSegment(double[] column, int start, int end) { - int n = end - start; - 
if(n <= 0) - return new double[] {0.0, 0.0}; - - double sumX = 0, sumY = 0, sumXX = 0, sumXY = 0; - for(int i = start; i < end; i++) { - double x = i; - double y = column[i]; - sumX += x; - sumY += y; - sumXX += x * x; - sumXY += x * y; - } - - double nD = n; - double denom = nD * sumXX - sumX * sumX; - double slope, intercept; - if(denom == 0) { - slope = 0.0; - intercept = sumY / nD; - } - else { - slope = (nD * sumXY - sumX * sumY) / denom; - intercept = (sumY - slope * sumX) / nD; - } - return new double[] {slope, intercept}; + return result; } private AColGroup compressSDCFromSparseTransposedBlock(IColIndex cols, int nrUniqueEstimate, double tupleSparsity) { From dfe2eee4665ae735b47f37a041433fec709dcbb3 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Thu, 5 Feb 2026 22:52:16 +0100 Subject: [PATCH 19/35] add: utils, methods to calculate piecewiseLinearCompression --- .../functional/PiecewiseLinearUtils.java | 253 ++++++++++++++++++ 1 file changed, 253 insertions(+) create mode 100644 src/main/java/org/apache/sysds/runtime/compress/colgroup/functional/PiecewiseLinearUtils.java diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/functional/PiecewiseLinearUtils.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/functional/PiecewiseLinearUtils.java new file mode 100644 index 00000000000..7005be9de65 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/functional/PiecewiseLinearUtils.java @@ -0,0 +1,253 @@ +package org.apache.sysds.runtime.compress.colgroup.functional; + +import org.apache.sysds.runtime.compress.CompressionSettings; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class PiecewiseLinearUtils { + + private PiecewiseLinearUtils() { + + } + + public static final class SegmentedRegression { + private final int[] breakpoints; + 
private final double[] slopes; + private final double[] intercepts; + + public SegmentedRegression(int[] breakpoints, double[] slopes, double[] intercepts) { + this.breakpoints = breakpoints; + this.slopes = slopes; + this.intercepts = intercepts; + } + + public int[] getBreakpoints() { + return breakpoints; + } + + public double[] getSlopes() { + return slopes; + } + + public double[] getIntercepts() { + return intercepts; + } + } + + public static SegmentedRegression compressSegmentedLeastSquares(double[] column, CompressionSettings cs) { + //compute Breakpoints for a Column with dynamic Programming + final List breakpointsList = computeBreakpoints(cs, column); + final int[] breakpoints = breakpointsList.stream().mapToInt(Integer::intValue).toArray(); + + //get values for Regression + final int numSeg = breakpoints.length - 1; + final double[] slopes = new double[numSeg]; + final double[] intercepts = new double[numSeg]; + + // Regress per Segment + for (int seg = 0; seg < numSeg; seg++) { + final int SegStart = breakpoints[seg]; + final int SegEnd = breakpoints[seg + 1]; + + final double[] line = regressSegment(column, SegStart, SegEnd); + slopes[seg] = line[0]; //slope regession line + intercepts[seg] = line[1]; //intercept regression line + } + + return new SegmentedRegression(breakpoints, slopes, intercepts); + } + + public static SegmentedRegression compressSegmentedLeastSquaresV2(double[] column, CompressionSettings cs) { + //compute Breakpoints for a Column with Greedy Algorithm + + final List breakpointsList = computeBreakpointsGreedy(column, cs); + final int[] breakpoints = breakpointsList.stream().mapToInt(Integer::intValue).toArray(); + + //get values for Regression + final int numSeg = breakpoints.length - 1; + final double[] slopes = new double[numSeg]; + final double[] intercepts = new double[numSeg]; + + // Regress per Segment + for (int seg = 0; seg < numSeg; seg++) { + final int segstart = breakpoints[seg]; + final int segEnd = breakpoints[seg + 
1]; + final double[] line = regressSegment(column, segstart, segEnd); + slopes[seg] = line[0]; + intercepts[seg] = line[1]; + } + return new SegmentedRegression(breakpoints,slopes, intercepts); + } + + public static double[] getColumn(MatrixBlock in, int colIndex) { + final int numRows = in.getNumRows(); + final double[] column = new double[numRows]; + + for (int row = 0; row < numRows; row++) { + column[row] = in.get(row, colIndex); + } + return column; + } + + public static List computeBreakpoints(CompressionSettings cs, double[] column) { + final int numElements = column.length; + final double targetMSE = cs.getPiecewiseTargetLoss(); + + + // TODO: Maybe remove Fallback if no targetloss is given + /*if (Double.isNaN(targetMSE) || targetMSE <= 0) { + final double segmentPenalty = 2.0 * Math.log(numElements); + return computeBreakpointsLambda(column, segmentPenalty); + }*/ + + // max targetloss + final double sseMax = numElements * targetMSE; + double minLoss = 0.0; + double maxLoss = numElements * 100.0; + List bestBreaks = null; + //compute breakpoints + while(maxLoss -minLoss > 1e-8) { + final double currentLoss = 0.5 * (minLoss + maxLoss); + final List breaks = computeBreakpointsLambda(column, currentLoss); + final double totalSSE = computeTotalSSE(column, breaks); + if (totalSSE <= sseMax) { + bestBreaks = breaks; + minLoss = currentLoss; + } + else { + maxLoss = currentLoss; + } + } + + if (bestBreaks == null) + bestBreaks = computeBreakpointsLambda(column, minLoss); + + return bestBreaks; + } + + public static List computeBreakpointsLambda(double[] column, double lambda) { + final int numrows = column.length; + final double[] costs = new double[numrows + 1]; //min Cost + final int[] prevStart = new int[numrows + 1]; //previous Start + costs[0] = 0.0; + // Find Cost + for (int rowEnd = 1; rowEnd <= numrows; rowEnd++) { + costs[rowEnd] = Double.POSITIVE_INFINITY; + //Test all possible Segment to find the lowest costs + for (int rowStart = 0; rowStart < 
rowEnd; rowStart++) { + //costs = current costs + segmentloss + penaltiy + final double costCurrentSegment = computeSegmentCost(column, rowStart, rowEnd); + final double totalCost = costs[rowStart] + costCurrentSegment + lambda; + // Check if it is the better solution + if (totalCost < costs[rowEnd]) { + costs[rowEnd] = totalCost; + prevStart[rowEnd] = rowStart; + } + } + } + //Check the optimal segmentlimits + final List segmentLimits = new ArrayList<>(); + int breakpointIndex = numrows; + while (breakpointIndex > 0) { + segmentLimits.add(breakpointIndex); + breakpointIndex = prevStart[breakpointIndex]; + } + segmentLimits.add(0); + Collections.sort(segmentLimits); + return segmentLimits; + } + + public static double computeSegmentCost(double[] column, int start, int end) { + final int segSize = end - start; + if (segSize <= 1) + return 0.0; + + final double[] ab = regressSegment(column, start, end); //Regressionline + final double slope = ab[0]; + final double intercept = ab[1]; + + double sumSquaredError = 0.0; + for (int i = start; i < end; i++) { + final double rowIdx = i; + final double actualValue = column[i]; + final double predictedValue = slope * rowIdx + intercept; + final double difference = actualValue - predictedValue; + sumSquaredError += difference * difference; + } + return sumSquaredError; + } + + public static double computeTotalSSE(double[] column, List breaks) { + double total = 0.0; + for (int s = 0; s < breaks.size() - 1; s++) { + final int start = breaks.get(s); + final int end = breaks.get(s + 1); + total += computeSegmentCost(column, start, end); + } + return total; + } + + public static double[] regressSegment(double[] column, int start, int end) { + final int numElements = end - start; + if (numElements <= 0) + return new double[] {0.0, 0.0}; + + double sumOfRowIndices = 0, sumOfColumnValues = 0, sumOfRowIndicesSquared = 0, productRowIndexTimesColumnValue = 0; + for (int i = start; i < end; i++) { + final double x = i; + final double y = 
column[i]; + sumOfRowIndices += x; + sumOfColumnValues += y; + sumOfRowIndicesSquared += x * x; + productRowIndexTimesColumnValue += x * y; + } + + final double numPointsInSegmentDouble = numElements; + final double denominatorForSlope = numPointsInSegmentDouble * sumOfRowIndicesSquared - sumOfRowIndices * sumOfRowIndices; + final double slope; + final double intercept; + if (denominatorForSlope == 0) { + slope = 0.0; + intercept = sumOfColumnValues / numPointsInSegmentDouble; + } + else { + slope = (numPointsInSegmentDouble * productRowIndexTimesColumnValue - sumOfRowIndices * sumOfColumnValues) / denominatorForSlope; + intercept = (sumOfColumnValues - slope * sumOfRowIndices) / numPointsInSegmentDouble; + } + return new double[] {slope, intercept}; + } + public static List computeBreakpointsGreedy(double[] column, CompressionSettings cs) { + final int numElements = column.length; + final double targetMSE = cs.getPiecewiseTargetLoss(); + if (Double.isNaN(targetMSE) || targetMSE <= 0) { + return Arrays.asList(0, numElements); // Fallback: ein Segment + } + + List breakpoints = new ArrayList<>(); + breakpoints.add(0); + int currentStart = 0; + + while (currentStart < numElements) { + int bestEnd = numElements; // Default: Rest als Segment + for (int end = currentStart + 1; end <= numElements; end++) { + double sse = computeSegmentCost(column, currentStart, end); + double sseMax = (end - currentStart) * targetMSE; + if (sse > sseMax) { + bestEnd = end - 1; // Letzter gültiger Endpunkt + break; + } + } + breakpoints.add(bestEnd); + currentStart = bestEnd; + } + + if (breakpoints.get(breakpoints.size() - 1) != numElements) { + breakpoints.add(numElements); + } + return breakpoints; + } +} From 9e0d18b3c198d4d3e252a4ca7970da9c217fac82 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Thu, 5 Feb 2026 23:58:48 +0100 Subject: [PATCH 20/35] wip: clear up tests add: test with randomly generated Data --- 
...ColGroupPiecewiseLinearCompressedTest.java | 234 +++++++++++------- 1 file changed, 144 insertions(+), 90 deletions(-) rename src/test/java/org/apache/sysds/{runtime => test/component}/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java (76%) diff --git a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java b/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java similarity index 76% rename from src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java rename to src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java index 4f309fda967..fa1f88fab98 100644 --- a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java +++ b/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java @@ -1,27 +1,37 @@ -package org.apache.sysds.runtime.compress.colgroup; +package org.apache.sysds.test.component.compress.colgroup; import org.apache.sysds.runtime.compress.CompressionSettings; import org.apache.sysds.runtime.compress.CompressionSettingsBuilder; +import org.apache.sysds.runtime.compress.colgroup.AColGroup; +import org.apache.sysds.runtime.compress.colgroup.ColGroupFactory; +import org.apache.sysds.runtime.compress.colgroup.ColGroupPiecewiseLinearCompressed; +import org.apache.sysds.runtime.compress.colgroup.functional.PiecewiseLinearUtils; import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory; import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex; -import org.apache.sysds.runtime.compress.colgroup.scheme.ColGroupPiecewiseLinearCompressed; import org.apache.sysds.runtime.compress.estim.CompressedSizeInfo; import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup; import org.apache.sysds.runtime.compress.estim.EstimationFactors; import 
org.apache.sysds.runtime.data.DenseBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.util.DataConverter; +import org.apache.sysds.test.AutomatedTestBase; import org.junit.Test; import java.util.Arrays; import java.util.List; -import static org.apache.sysds.runtime.compress.colgroup.ColGroupFactory.*; +import static org.apache.sysds.runtime.compress.colgroup.functional.PiecewiseLinearUtils.*; +import static org.apache.sysds.test.functions.io.binary.BlocksizeTest.sparsity; import static org.junit.Assert.*; -public class ColGroupPiecewiseLinearCompressedTest { +public class ColGroupPiecewiseLinearCompressedTest extends AutomatedTestBase { + @Override + public void setUp() { + + } @Test - public void testComputeBreakpoints_uniformColumn() { + public void testComputeBreakpointsUniformColumn() { CompressionSettings cs = new CompressionSettingsBuilder().create(); cs.setPiecewiseTargetLoss(1e-3); double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; // ← Test-spezifisch @@ -30,7 +40,7 @@ public void testComputeBreakpoints_uniformColumn() { } @Test - public void testComputeBreakpoints_linearIncreasing() { + public void testComputeBreakpointsLinearIncreasing() { CompressionSettings cs = new CompressionSettingsBuilder().create(); cs.setPiecewiseTargetLoss(1e-3); double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; // ← andere column @@ -39,17 +49,10 @@ public void testComputeBreakpoints_linearIncreasing() { } - @Test - public void testComputeBreakpoints_highLoss_uniform() { - CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(10000.0); - double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; - List breaks = computeBreakpoints(cs, column); - assertEquals(Arrays.asList(0, 5), breaks); - } + @Test - public void testComputeBreakpoints_twoSegments() { + public void testComputeBreakpointsTwoSegments() { CompressionSettings cs = new CompressionSettingsBuilder().create(); cs.setPiecewiseTargetLoss(1e-3); // {1,1,1, 
2,2,2} → 2 Segmente → [0,3,6] @@ -58,18 +61,10 @@ public void testComputeBreakpoints_twoSegments() { assertEquals(Arrays.asList(0, 3, 6), breaks); } - @Test - public void testComputeBreakpoints_noLoss_linear() { - CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(0.0); - //cs.setPiecewiseTargetLoss(0.0); - double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; - List breaks = computeBreakpoints(cs, column); - assertEquals(Arrays.asList(0, 5), breaks); // bei 0 Loss alle Breaks - } + @Test - public void testComputeBreakpointsLambda_const() { + public void testComputeBreakpointsLambdaConst() { double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; List breaks = computeBreakpointsLambda(column, 5.0); assertEquals(Arrays.asList(0, 5), breaks); @@ -79,7 +74,7 @@ public void testComputeBreakpointsLambda_const() { } @Test - public void testComputeBreakpointsLambda_twoSegments() { + public void testComputeBreakpointsLambdaTwoSegments() { double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0}; // 6 Werte // mit kleinem lambda -> viele Segmente (kostenlos fast) @@ -94,7 +89,7 @@ public void testComputeBreakpointsLambda_twoSegments() { } @Test - public void testComputeBreakpointsLambda_jumpWithTrend() { + public void testComputeBreakpointsLambdaJumpWithTrend() { double[] column = {0.0, 1.0, 2.0, 10.0, 11.0, 12.0}; // grobe Segmentanpassung: ein Segment pro „Abschnitt“ @@ -107,7 +102,7 @@ public void testComputeBreakpointsLambda_jumpWithTrend() { } @Test - public void testComputeBreakpointsLambda_linear() { + public void testComputeBreakpointsLambdaLinear() { double[] column = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0}; List breaks = computeBreakpointsLambda(column, 1.0); @@ -121,7 +116,7 @@ public void testComputeBreakpointsLambda_linear() { } @Test - public void testComputeBreakpointsLambda_edge_lambdaVerySmall() { + public void testComputeBreakpointsLambdaEdgeLambdaVerySmall() { double[] column = {1.0, 1.1, 1.0, 1.1, 1.0}; List breaks = 
computeBreakpointsLambda(column, 0.001); @@ -137,7 +132,7 @@ public void testComputeBreakpointsLambda_edge_lambdaVerySmall() { } @Test - public void testComputeBreakpointsLambda_edge_lambdaVeryLarge() { + public void testComputeBreakpointsLambdaEdgeLambdaVeryLarge() { double[] column = {1.0, 2.0, 1.5, 2.5, 1.8}; List breaks = computeBreakpointsLambda(column, 1000.0); @@ -145,7 +140,7 @@ public void testComputeBreakpointsLambda_edge_lambdaVeryLarge() { } @Test - public void testComputeSegmentCost_emptyOrSingle() { + public void testComputeSegmentCostEmptyOrSingle() { double[] column = {10.0, 20.0, 30.0}; // 0 Elemente (leer) @@ -159,7 +154,7 @@ public void testComputeSegmentCost_emptyOrSingle() { } @Test - public void testComputeSegmentCost_twoConstantPoints() { + public void testComputeSegmentCostTwoConstantPoints() { double[] column = {5.0, 5.0, 1.0, 1.0}; // Zwei identische Punkte (konstant) → SSE = 0 @@ -168,7 +163,7 @@ public void testComputeSegmentCost_twoConstantPoints() { } @Test - public void testComputeSegmentCost_twoDifferentPoints() { + public void testComputeSegmentCostTwoDifferentPoints() { double[] column = {0.0, 2.0, 1.0, 3.0}; // Zwei Punkte: (0,0) und (1,2) → Gerade y = 2*x, Fehler = 0 @@ -181,14 +176,14 @@ public void testComputeSegmentCost_twoDifferentPoints() { } @Test - public void testComputeSegmentCost_constantThree() { + public void testComputeSegmentCostConstantThree() { double[] column = {0.0, 0.0, 0.0}; double sse = computeSegmentCost(column, 0, 3); assertEquals(0.0, sse, 1e-10); } @Test - public void testComputeSegmentCost_consistent_with_regression() { + public void testComputeSegmentCostConsistentWithRegression() { double[] column = {0.0, 2.0, 0.0, 4.0, 0.0, 6.0}; int start = 0, end = 3; @@ -205,30 +200,9 @@ public void testComputeSegmentCost_consistent_with_regression() { assertEquals(sse_hand, sse, 1e-10); } - @Test - public void testComputeTotalSSE_emptyBreaks() { - double[] column = {1.0, 2.0, 3.0}; - List breaks = Arrays.asList(); 
// leer → keine Segmente - double total = computeTotalSSE(column, breaks); - - // 0 Segmente → Summe über 0 Segmente = 0 - assertEquals(0.0, total, 1e-10); - } @Test - public void testComputeTotalSSE_singleSegment_all() { - double[] column = {1.0, 2.0, 3.0}; - List breaks = Arrays.asList(0, 3); // ein Segment [0,3) - - double total = computeTotalSSE(column, breaks); - double expected = computeSegmentCost(column, 0, 3); - - // Ergebnis muss exakt das gleiche wie der SSE des gesamten Segments sein - assertEquals(expected, total, 1e-10); - } - - @Test - public void testComputeTotalSSE_twoSegments() { + public void testComputeTotalSSETwoSegments() { // Beispiel: [0,0,0] und [1,1,1] (jeweils konstant) double[] column = {0.0, 0.0, 0.0, 1.0, 1.0, 1.0}; List breaks = Arrays.asList(0, 3, 6); // zwei Segmente @@ -243,7 +217,7 @@ public void testComputeTotalSSE_twoSegments() { } @Test - public void testComputeTotalSSE_threeSegments() { + public void testComputeTotalSSEThreeSegments() { // Ein Segment mit drei identischen Werten, zwei Segmente mit jeweils zwei Werten double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0}; List breaks = Arrays.asList(0, 3, 5, 7); @@ -263,7 +237,7 @@ public void testComputeTotalSSE_threeSegments() { } @Test - public void testComputeTotalSSE_gapStartEnd() { + public void testComputeTotalSSEGapStartEnd() { double[] column = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0}; List breaks = Arrays.asList(2, 5, 8); @@ -276,7 +250,7 @@ public void testComputeTotalSSE_gapStartEnd() { } @Test - public void testComputeTotalSSE_oneSegment_identical() { + public void testComputeTotalSSEOneSegmentIdentical() { double[] column = {1.0, 2.0, 3.0, 4.0, 5.0}; double sseTotal = computeSegmentCost(column, 0, 5); @@ -287,7 +261,7 @@ public void testComputeTotalSSE_oneSegment_identical() { } @Test - public void testComputeTotalSSE_nonConstant() { + public void testComputeTotalSSENonConstant() { double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; List breaks = 
Arrays.asList(0, 2, 5); @@ -300,7 +274,7 @@ public void testComputeTotalSSE_nonConstant() { } @Test - public void testComputeTotalSSE_edgeCases() { + public void testComputeTotalSSEEdgeCases() { double[] columnEmpty = {}; List breaksEmpty = Arrays.asList(0, 0); assertEquals(0.0, computeTotalSSE(columnEmpty, breaksEmpty), 1e-10); @@ -312,7 +286,7 @@ public void testComputeTotalSSE_edgeCases() { } @Test - public void testRegressSegment_empty() { + public void testRegressSegmentEmpty() { double[] column = {1.0, 2.0, 3.0}; double[] result = regressSegment(column, 0, 0); assertEquals(0.0, result[0], 1e-10); @@ -320,7 +294,7 @@ public void testRegressSegment_empty() { } @Test - public void testRegressSegment_singlePoint() { + public void testRegressSegmentSinglePoint() { double[] column = {1.0, 2.0, 3.0}; double[] result = regressSegment(column, 1, 2); @@ -329,7 +303,7 @@ public void testRegressSegment_singlePoint() { } @Test - public void testRegressSegment_twoIdentical() { + public void testRegressSegmentTwoIdentical() { double[] column = {5.0, 5.0, 1.0, 1.0}; double[] result = regressSegment(column, 0, 2); @@ -338,7 +312,7 @@ public void testRegressSegment_twoIdentical() { } @Test - public void testRegressSegment_twoPoints() { + public void testRegressSegmentTwoPoints() { double[] column = {0.0, 2.0}; double[] result = regressSegment(column, 0, 2); @@ -347,7 +321,7 @@ public void testRegressSegment_twoPoints() { } @Test - public void testRegressSegment_twoPoints_offset() { + public void testRegressSegmentTwoPointsOffset() { double[] column = {1.0, 3.0, 5.0, 7.0}; double[] result = regressSegment(column, 2, 4); @@ -357,7 +331,7 @@ public void testRegressSegment_twoPoints_offset() { } @Test - public void testRegressSegment_constant() { + public void testRegressSegmentConstant() { double[] column = {3.0, 3.0, 3.0, 3.0}; double[] result = regressSegment(column, 0, 4); @@ -366,7 +340,7 @@ public void testRegressSegment_constant() { } @Test - public void 
testRegressSegment_linear() { + public void testRegressSegmentLinear() { double[] column = new double[4]; double a = 1.5, b = 2.0; for(int i = 0; i < 4; i++) { @@ -379,17 +353,10 @@ public void testRegressSegment_linear() { assertEquals(b, result[1], 1e-10); } - @Test - public void testRegressSegment_denomZero() { - double[] column = {10.0}; - double[] result = regressSegment(column, 0, 1); - assertEquals(0.0, result[0], 1e-10); - assertEquals(10.0, result[1], 1e-10); - } @Test - public void testCompressPiecewiseLinearFunctional_const() { + public void testCompressPiecewiseLinearFunctionalConst() { // 1. MatrixBlock mit einer konstanten Spalte erzeugen int nrows = 20, ncols = 1; MatrixBlock in = new MatrixBlock(nrows, ncols, false); @@ -428,35 +395,35 @@ public void testCompressPiecewiseLinearFunctional_const() { } @Test(expected = IllegalArgumentException.class) - public void testCreate_nullBreakpoints() { + public void testCreateNullBreakpoints() { int[] nullBp = null; ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), nullBp, new double[] {1.0}, new double[] {0.0}, 10); } @Test(expected = IllegalArgumentException.class) - public void testCreate_tooFewBreakpoints() { + public void testCreateTooFewBreakpoints() { int[] singleBp = {0}; ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), singleBp, new double[] {1.0}, new double[] {0.0}, 10); } @Test(expected = IllegalArgumentException.class) - public void testCreate_inconsistentSlopes() { + public void testCreateInconsistentSlopes() { int[] bp = {0, 5, 10}; ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), bp, new double[] {1.0, 2.0, 3.0}, new double[] {0.0, 1.0}, 10); } @Test(expected = IllegalArgumentException.class) - public void testCreate_inconsistentIntercepts() { + public void testCreateInconsistentIntercepts() { int[] bp = {0, 5, 10}; ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), bp, new 
double[] {1.0, 2.0}, new double[] {0.0}, 10); } @Test - public void testCreate_validMultiSegment() { + public void testCreateValidMultiSegment() { int[] bp = {0, 3, 7, 10}; double[] slopes = {1.0, -2.0, 0.5}; double[] intercepts = {0.0, 5.0, -1.0}; @@ -469,7 +436,7 @@ public void testCreate_validMultiSegment() { } @Test - public void testCreate_multiColumn() { + public void testCreateMultiColumn() { IColIndex cols = ColIndexFactory.create(new int[] {5, 10, 15}); int[] bp = {0, 5}; double[] slopes = {3.0}; @@ -500,7 +467,7 @@ public void testCreate_multiColumn() { } @Test - public void testCreate_singleColumn() { + public void testCreateSingleColumn() { IColIndex cols = ColIndexFactory.create(new int[] {5}); int[] bp = {0, 5}; double[] slopes = {3.0}; @@ -516,7 +483,7 @@ public void testCreate_singleColumn() { } @Test - public void testCreate_validMinimal() { + public void testCreateValidMinimal() { // 1 Segment: [0,10] → y = 2.0 * r + 1.0 int[] bp = {0, 10}; @@ -586,7 +553,7 @@ private ColGroupPiecewiseLinearCompressed createTestGroup(int numRows) { } @Test - public void testDecompressToDenseBlock_fullRange() { + public void testDecompressToDenseBlockFullRange() { ColGroupPiecewiseLinearCompressed cg = createTestGroup(12); MatrixBlock target = new MatrixBlock(12, 1, false); @@ -606,7 +573,7 @@ public void testDecompressToDenseBlock_fullRange() { } @Test - public void testDecompressToDenseBlock_partialRange() { + public void testDecompressToDenseBlockPartialRange() { ColGroupPiecewiseLinearCompressed cg = createTestGroup(12); MatrixBlock target = new MatrixBlock(12, 1, false); @@ -625,7 +592,7 @@ public void testDecompressToDenseBlock_partialRange() { } @Test - public void testDecompressToDenseBlock_emptyRange() { + public void testDecompressToDenseBlockEmptyRange() { ColGroupPiecewiseLinearCompressed cg = createTestGroup(12); MatrixBlock target = new MatrixBlock(5, 1, false); @@ -643,7 +610,7 @@ public void testDecompressToDenseBlock_emptyRange() { } @Test - public 
void testDecompressToDenseBlock_nullSafety() { + public void testDecompressToDenseBlockNullSafety() { ColGroupPiecewiseLinearCompressed cg = createTestGroup(10); // Null DenseBlock @@ -677,7 +644,7 @@ private CompressedSizeInfo createTestCompressedSizeInfo() { } @Test - public void testCompressPiecewiseLinear_viaRealAPI() { + public void testCompressPiecewiseLinearViaRealAPI() { MatrixBlock in = new MatrixBlock(10, 1, false); in.allocateDenseBlock(); @@ -695,5 +662,92 @@ public void testCompressPiecewiseLinear_viaRealAPI() { boolean hasPiecewise = colGroups.stream().anyMatch(cg -> cg instanceof ColGroupPiecewiseLinearCompressed); assertTrue(hasPiecewise); } + @Test + + public void testGreedy_linearColumn_singleSegment() { + // 2. Perfekte Gerade → 1 Segment + double[] linearCol = {1.0, 2.0, 3.0, 4.0, 5.0}; // y=x+1 + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(1e-6); + + List breaks = PiecewiseLinearUtils.computeBreakpointsGreedy(linearCol, cs); + assertEquals("[0, 5]", breaks.toString()); // SSE=0 ✓ + } + + @Test + public void testGreedy_noisyColumn_multipleSegments() { + // 3. Mit Sprung → 2 Segmente + double[] noisyCol = {1.1, 1.9, 2.2, 10.1, 10.8, 11.3}; // Sprung bei 3 + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(1.0); // Erlaubt MSE=1 + + List breaks = PiecewiseLinearUtils.computeBreakpointsGreedy(noisyCol, cs); + // Erwartet mind. 2 Segmente (Sprung erkennen) + assertTrue(breaks.size() >= 3); // [0, ?, 6] + } + + @Test + public void testGreedy_targetLossIncreasesSegments() { + // 4. 
Höherer Target-Loss → weniger Segmente + double[] colWithJumps = {1,2,3, 10,11,12, 20,21,22}; + CompressionSettings csStrict = new CompressionSettingsBuilder().create(); + csStrict.setPiecewiseTargetLoss(0.01); // Streng → viele Segmente + + CompressionSettings csLoose = new CompressionSettingsBuilder().create(); + csLoose.setPiecewiseTargetLoss(10.0); + + List strictBreaks = PiecewiseLinearUtils.computeBreakpointsGreedy(colWithJumps, csStrict); + List looseBreaks = PiecewiseLinearUtils.computeBreakpointsGreedy(colWithJumps, csLoose); + + // Strenger Target → mehr Segmente + assertTrue(strictBreaks.size() > looseBreaks.size()); + } + + + @Test + public void testMultiColumnTargetLossRespected() { + final int rows = 50, cols = 2; + double[][] data = getRandomMatrix(rows, cols, 0, 10, 1.0, 42L); + MatrixBlock orig = DataConverter.convertToMatrixBlock(data); + orig.allocateDenseBlock(); + + IColIndex colIdx = ColIndexFactory.create(0, cols-1); + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(1.0); + + AColGroup cg = ColGroupFactory.compressPiecewiseLinearFunctional(colIdx, orig, cs); + + MatrixBlock target = new MatrixBlock(rows, cols, false); + target.allocateDenseBlock(); + cg.decompressToDenseBlock(target.getDenseBlock(), 0, rows-1, 0, cols-1); + + // Test MSE für jede Spalte + for (int c = 0; c < cols; c++) { + double mse = computeColumnMSE(orig, target, c); + assertTrue("Col " + c + " MSE=" + mse + " > target=1.0", mse <= 1.0); + } + } + + + private double computeColumnMSE(MatrixBlock orig, MatrixBlock reconstructed, int colIdx) { + double mse = 0.0; + final int numRows = orig.getNumRows(); + + DenseBlock origDb = orig.getDenseBlock(); + DenseBlock reconDb = reconstructed.getDenseBlock(); + + for (int row = 0; row < numRows; row++) { + final double origValue = origDb.get(row, colIdx); // ← DENSEBLOCK.GET! 
+ final double reconValue = reconDb.get(row, colIdx); + final double squaredError = (origValue - reconValue) * (origValue - reconValue); + mse += squaredError; + } + + return mse / numRows; + } + + + + } From abeced44c770ba6b4f67d182feec958a6720426d Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Fri, 6 Feb 2026 00:00:29 +0100 Subject: [PATCH 21/35] fix: revert pom.xml --- pom.xml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pom.xml b/pom.xml index eba29562841..08669868aa1 100644 --- a/pom.xml +++ b/pom.xml @@ -1577,11 +1577,5 @@ fastdoubleparser 0.9.0 - - org.junit.jupiter - junit-jupiter - RELEASE - test - - + From fc528aef1eed0d2bc36dbb3f8291e2d062a3204c Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Mon, 16 Feb 2026 11:37:09 +0100 Subject: [PATCH 22/35] rm files --- bin/systemds-standalone.sh | 12 -------- use-java17-systemds.sh | 57 -------------------------------------- 2 files changed, 69 deletions(-) delete mode 100755 bin/systemds-standalone.sh delete mode 100755 use-java17-systemds.sh diff --git a/bin/systemds-standalone.sh b/bin/systemds-standalone.sh deleted file mode 100755 index 9efaa963a4b..00000000000 --- a/bin/systemds-standalone.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -# Standalone-Launcher für SystemDS - -SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) -JAR_FILE="$SCRIPT_DIR/../target/systemds-3.4.0-SNAPSHOT.jar" - -if [ ! 
-f "$JAR_FILE" ]; then - echo "ERROR: Standalone JAR nicht gefunden: $JAR_FILE" - exit 1 -fi - -java -cp "$JAR_FILE" org.apache.sysds.api.DMLScript "$@" diff --git a/use-java17-systemds.sh b/use-java17-systemds.sh deleted file mode 100755 index 0c1a2fda871..00000000000 --- a/use-java17-systemds.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash -# ------------------------------------------------------------------ -# SystemDS macOS Build-Skript -# Setzt JAVA_HOME, PATH, Maven und erzeugt systemds-standalone.sh -# ------------------------------------------------------------------ - -# 1️⃣ Setze Java 17 -export JAVA_HOME=$(/usr/libexec/java_home -v 17) -export PATH="$JAVA_HOME/bin:/usr/bin:/bin:/usr/sbin:/sbin:/usr/local/bin:/opt/homebrew/bin:/opt/homebrew/sbin:$PATH" - -# 2️⃣ Optional: Python, ghcup, uix/Deno, Coursier, JetBrains Toolbox -export PATH="/Library/Frameworks/Python.framework/Versions/3.11/bin:$HOME/.ghcup/bin:$HOME/.uix/bin:$PATH" -export DENO_INSTALL="$HOME/.uix" -export PATH="$DENO_INSTALL/bin:$PATH" -export PATH="$PATH:/Users/mori/Library/Application Support/Coursier/bin" -export PATH="$PATH:/Users/mori/Library/Application Support/JetBrains/Toolbox/scripts" - -# 3️⃣ Prüfen, ob Maven existiert -if ! command -v mvn >/dev/null 2>&1; then - echo "ERROR: Maven (mvn) nicht gefunden. Bitte installieren!" - exit 1 -fi - -# 4️⃣ Prüfen, ob wir im Projekt-Root sind (pom.xml vorhanden) -if [ ! -f "pom.xml" ]; then - echo "ERROR: pom.xml nicht gefunden. Bitte ins SystemDS-Projekt-Root wechseln." - exit 1 -fi - -# 5️⃣ Maven Build ausführen -echo "📦 Starte Maven Build..." -mvn clean package -DskipTests - -# 6️⃣ Standalone-Skript erzeugen -echo "🔧 Erzeuge bin/systemds-standalone.sh..." - -mkdir -p bin -cat > bin/systemds-standalone.sh << 'EOF' -#!/bin/bash -# Standalone-Launcher für SystemDS - -SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) -JAR_FILE="$SCRIPT_DIR/../target/systemds-3.4.0-SNAPSHOT.jar" - -if [ ! 
-f "$JAR_FILE" ]; then - echo "ERROR: Standalone JAR nicht gefunden: $JAR_FILE" - exit 1 -fi - -java -cp "$JAR_FILE" org.apache.sysds.api.DMLScript "$@" -EOF - -# 7️⃣ Ausführbar machen -chmod +x bin/systemds-standalone.sh - -echo "✅ Fertig! Standalone-Skript erstellt: bin/systemds-standalone.sh" - From 7b920c1a5bca26bf65d4b44ffe45476dd71f9c89 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Tue, 3 Mar 2026 10:42:19 +0100 Subject: [PATCH 23/35] add: CompressionType and ColGroupType PiecewiseLinear --- .../runtime/compress/colgroup/AColGroup.java | 236 +++++++++--------- 1 file changed, 120 insertions(+), 116 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java index e2bf69f5c15..07382ed932b 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java @@ -55,7 +55,7 @@ /** * Abstract Class that is the lowest class type for the Compression framework. - * + * * AColGroup store information about a number of columns. * */ @@ -65,7 +65,8 @@ public abstract class AColGroup implements Serializable { /** Public super types of compression ColGroups supported */ public static enum CompressionType { - UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCFOR, DDCFOR, DeltaDDC, LinearFunctional, PiecewiseLinear; + UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCFOR, DDCFOR, DeltaDDC, LinearFunctional, PiecewiseLinear, + PiecewiseLinearSukzessive; public boolean isDense() { return this == DDC || this == CONST || this == DDCFOR || this == DDCFOR; @@ -82,12 +83,12 @@ public boolean isSDC() { /** * Concrete ColGroupType - * + * * Protected such that outside the ColGroup package it should be unknown which specific subtype is used. 
*/ protected static enum ColGroupType { UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCSingle, SDCSingleZeros, SDCZeros, SDCFOR, DDCFOR, DeltaDDC, - LinearFunctional; + LinearFunctional, PiecewiseLinear; } /** The ColGroup indexes contained in the ColGroup */ @@ -95,7 +96,7 @@ protected static enum ColGroupType { /** * Main constructor. - * + * * @param colIndices offsets of the columns in the matrix block that make up the group */ protected AColGroup(IColIndex colIndices) { @@ -104,7 +105,7 @@ protected AColGroup(IColIndex colIndices) { /** * Obtain the offsets of the columns in the matrix block that make up the group - * + * * @return offsets of the columns in the matrix block that make up the group */ public final IColIndex getColIndices() { @@ -113,7 +114,7 @@ public final IColIndex getColIndices() { /** * Obtain the number of columns in this column group. - * + * * @return number of columns in this column group */ public final int getNumCols() { @@ -124,9 +125,9 @@ public final int getNumCols() { * Shift all column indexes contained by an offset. * * This is used for rbind to combine compressed matrices. - * + * * Since column indexes are reused between operations, we allocate a new list here to be safe - * + * * @param offset The offset to move all columns * @return A new column group object with the shifted columns */ @@ -138,7 +139,7 @@ public final AColGroup shiftColIndices(int offset) { * Copy the content of the column group with pointers to the previous content but with new column given Note this * method does not verify if the colIndexes specified are valid and correct dimensions for the underlying column * groups. - * + * * @param colIndexes the new indexes to use in the copy * @return a new object with pointers to underlying data. */ @@ -146,7 +147,7 @@ public final AColGroup shiftColIndices(int offset) { /** * Get the upper bound estimate of in memory allocation for the column group. 
- * + * * @return an upper bound on the number of bytes used to store this ColGroup in memory. */ public long estimateInMemorySize() { @@ -157,9 +158,9 @@ public long estimateInMemorySize() { /** * Decompress a range of rows into a sparse block - * + * * Note that this is using append, so the sparse column indexes need to be sorted afterwards. - * + * * @param sb Sparse Target block * @param rl Row to start at * @param ru Row to end at @@ -170,7 +171,7 @@ public final void decompressToSparseBlock(SparseBlock sb, int rl, int ru) { /** * Decompress a range of rows into a dense block - * + * * @param db Dense target block * @param rl Row to start at * @param ru Row to end at @@ -181,7 +182,7 @@ public final void decompressToDenseBlock(DenseBlock db, int rl, int ru) { /** * Decompress a range of rows into a dense transposed block. - * + * * @param db Dense target block * @param rl Row in this column group to start at. * @param ru Row in this column group to end at. @@ -191,7 +192,7 @@ public final void decompressToDenseBlock(DenseBlock db, int rl, int ru) { /** * Decompress the column group to the sparse transposed block. Note that the column groups would only need to * decompress into specific sub rows of the Sparse block - * + * * @param sb Sparse target block * @param nColOut The number of columns in the sb. */ @@ -199,7 +200,7 @@ public final void decompressToDenseBlock(DenseBlock db, int rl, int ru) { /** * Serializes column group to data output. - * + * * @param out data output * @throws IOException if IOException occurs */ @@ -212,7 +213,7 @@ protected void write(DataOutput out) throws IOException { /** * Returns the exact serialized size of column group. This can be used for example for buffer preallocation. 
- * + * * @return exact serialized size for column group */ public long getExactSizeOnDisk() { @@ -225,11 +226,11 @@ public long getExactSizeOnDisk() { /** * Slice out the columns within the range of cl and cu to remove the dictionary values related to these columns. If * the ColGroup slicing from does not contain any columns within the range null is returned. - * + * * @param cl The lower bound of the columns to select * @param cu The upper bound of the columns to select (not inclusive). * @return A cloned Column Group, with a copied pointer to the old column groups index structure, but reduced - * dictionary and _columnIndexes correctly aligned with the expected sliced compressed matrix. + * dictionary and _columnIndexes correctly aligned with the expected sliced compressed matrix. */ public final AColGroup sliceColumns(int cl, int cu) { if(cl <= _colIndexes.get(0) && cu > _colIndexes.get(_colIndexes.size() - 1)) { @@ -247,10 +248,10 @@ else if(cu - cl == 1) /** * Slice out a single column from the column group. - * + * * @param col The column to slice, the column could potentially not be inside the column group * @return A new column group that is a single column, if the column requested is not in this column group null is - * returned. + * returned. */ public final AColGroup sliceColumn(int col) { int idx = _colIndexes.findIndex(col); @@ -262,11 +263,11 @@ public final AColGroup sliceColumn(int col) { /** * Slice out multiple columns within the interval between the given indexes. 
- * + * * @param cl The lower column index to slice from * @param cu The upper column index to slice to, (not included) * @return A column group of this containing the columns specified, returns null if the columns specified is not - * contained in the column group + * contained in the column group */ protected final AColGroup sliceMultiColumns(int cl, int cu) { SliceResult sr = _colIndexes.slice(cl, cu); @@ -278,7 +279,7 @@ protected final AColGroup sliceMultiColumns(int cl, int cu) { /** * Compute the column sum of the given list of groups - * + * * @param groups The Groups to sum * @param res The result to put the values into * @param nRows The number of rows in the groups @@ -292,9 +293,9 @@ public static double[] colSum(Collection groups, double[] res, int nR /** * Get the value at a global row/column position. - * + * * In general this performs since a binary search of colIndexes is performed for each lookup. - * + * * @param r row * @param c column * @return value at the row/column position @@ -309,7 +310,7 @@ public double get(int r, int c) { /** * Get the value at a colGroup specific row/column index position. - * + * * @param r row * @param colIdx column index in the _colIndexes. * @return value at the row/column index position @@ -318,16 +319,16 @@ public double get(int r, int c) { /** * Obtain number of distinct tuples in contained sets of values associated with this column group. - * + * * If the column group is uncompressed the number or rows is returned. - * + * * @return the number of distinct sets of values associated with the bitmaps in this column group */ public abstract int getNumValues(); /** * Obtain the compression type. - * + * * @return How the elements of the column group are compressed. */ public abstract CompressionType getCompType(); @@ -335,14 +336,14 @@ public double get(int r, int c) { /** * Internally get the specific type of ColGroup, this could be extracted from the object but that does not allow for * nice switches in the code. 
- * + * * @return ColGroupType of the object. */ protected abstract ColGroupType getColGroupType(); /** * Decompress into the DenseBlock. (no NNZ handling) - * + * * @param db Target DenseBlock * @param rl Row to start decompression from * @param ru Row to end decompression at (not inclusive) @@ -353,10 +354,10 @@ public double get(int r, int c) { /** * Decompress into the SparseBlock. (no NNZ handling) - * + * * Note this method is allowing to calls to append since it is assumed that the sparse column indexes are sorted * afterwards - * + * * @param sb Target SparseBlock * @param rl Row to start decompression from * @param ru Row to end decompression at (not inclusive) @@ -367,9 +368,9 @@ public double get(int r, int c) { /** * Right matrix multiplication with this column group. - * + * * This method can return null, meaning that the output overlapping group would have been empty. - * + * * @param right The MatrixBlock on the right of this matrix multiplication * @return The new Column Group or null that is the result of the matrix multiplication. */ @@ -379,9 +380,9 @@ public final AColGroup rightMultByMatrix(MatrixBlock right) { /** * Right matrix multiplication with this column group. - * + * * This method can return null, meaning that the output overlapping group would have been empty. - * + * * @param right The MatrixBlock on the right of this matrix multiplication * @param allCols A pre-materialized list of all col indexes, that can be shared across all column groups if use * full, can be set to null. @@ -392,7 +393,7 @@ public final AColGroup rightMultByMatrix(MatrixBlock right) { /** * Right side Matrix multiplication, iterating though this column group and adding to the ret - * + * * @param right Right side matrix to multiply with. 
* @param ret The return matrix to add results to * @param rl The row of this column group to multiply from @@ -401,18 +402,20 @@ public final AColGroup rightMultByMatrix(MatrixBlock right) { * @param cru The right hand side column upper * @param nRows The number of rows in this column group */ - public void rightDecompressingMult(MatrixBlock right, MatrixBlock ret, int rl, int ru, int nRows, int crl, int cru){ - throw new NotImplementedException("not supporting right Decompressing Multiply on class: " + this.getClass().getSimpleName()); + public void rightDecompressingMult(MatrixBlock right, MatrixBlock ret, int rl, int ru, int nRows, int crl, + int cru) { + throw new NotImplementedException( + "not supporting right Decompressing Multiply on class: " + this.getClass().getSimpleName()); } /** * Do a transposed self matrix multiplication on the left side t(x) %*% x. but only with this column group. - * + * * This gives better performance since there is no need to iterate through all the rows of the matrix, but the * execution can be limited to its number of distinct values. - * + * * Note it only calculate the upper triangle - * + * * @param ret The return matrix block [numColumns x numColumns] * @param nRows The number of rows in the column group */ @@ -420,7 +423,7 @@ public void rightDecompressingMult(MatrixBlock right, MatrixBlock ret, int rl, i /** * Left multiply with this column group. - * + * * @param matrix The matrix to multiply with on the left * @param result The result to output the values into, always dense for the purpose of the column groups * parallelizing @@ -434,7 +437,7 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Left side matrix multiplication with a column group that is transposed. - * + * * @param lhs The left hand side Column group to multiply with, the left hand side should be considered * transposed. Also it should be guaranteed that this column group is not empty. 
* @param result The result matrix to insert the result of the multiplication into @@ -444,16 +447,16 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Matrix multiply with this other column group, but: - * + * * 1. Only output upper triangle values. - * + * * 2. Multiply both ways with "this" being on the left and on the right. - * + * * It should be guaranteed that the input is not the same as the caller of the method. - * + * * The second step is achievable by treating the initial multiplied matrix, and adding its values to the correct * locations in the output. - * + * * @param other The other Column group to multiply with * @param result The result matrix to put the results into */ @@ -462,7 +465,7 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Perform the specified scalar operation directly on the compressed column group, without decompressing individual * cells if possible. - * + * * @param op operation to perform * @return version of this column group with the operation applied */ @@ -470,7 +473,7 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Perform a binary row operation. - * + * * @param op The operation to execute * @param v The vector of values to apply the values contained should be at least the length of the highest * value in the column index @@ -481,7 +484,7 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Short hand add operator call on column group to add a row vector to the column group - * + * * @param v The vector to add * @return A new column group where the vector is added. */ @@ -491,7 +494,7 @@ public AColGroup addVector(double[] v) { /** * Perform a binary row operation. 
- * + * * @param op The operation to execute * @param v The vector of values to apply the values contained should be at least the length of the highest * value in the column index @@ -503,9 +506,9 @@ public AColGroup addVector(double[] v) { /** * Unary Aggregate operator, since aggregate operators require new object output, the output becomes an uncompressed * matrix. - * + * * The range of rl to ru only applies to row aggregates. (ReduceCol) - * + * * @param op The operator used * @param c The output matrix block * @param nRows The total number of rows in the Column Group @@ -516,9 +519,9 @@ public AColGroup addVector(double[] v) { /** * Slice out column at specific index of this column group. - * + * * It is guaranteed that the column to slice is contained in this columnGroup. - * + * * @param idx The column index to slice out. * @return A new column group containing the columns inside. (never null) */ @@ -526,9 +529,9 @@ public AColGroup addVector(double[] v) { /** * Slice range of columns inside this column group. - * + * * It is guaranteed that the columns to slice is contained in this columnGroup. - * + * * @param idStart The column index to start at * @param idEnd The column index to end at (not included) * @param outputCols The output columns to extract materialized for ease of implementation @@ -538,9 +541,10 @@ public AColGroup addVector(double[] v) { /** * Slice range of rows out of the column group and return a new column group only containing the row segment. - * - * Note that this slice should maintain pointers back to the original dictionaries and only modify index structures. - * + * + * Note that this slice should maintain pointers back to the original dictionaries and only modify index + * structures. + * * @param rl The row to start at * @param ru The row to end at (not included) * @return A new column group containing the specified row range. 
@@ -549,21 +553,21 @@ public AColGroup addVector(double[] v) { /** * Short hand method for getting minimum value contained in this column group. - * + * * @return The minimum value contained in this ColumnGroup */ public abstract double getMin(); /** * Short hand method for getting maximum value contained in this column group. - * + * * @return The maximum value contained in this ColumnGroup */ public abstract double getMax(); /** * Short hand method for getting the sum of this column group - * + * * @param nRows The number of rows in the column group * @return The sum of this column group */ @@ -571,7 +575,7 @@ public AColGroup addVector(double[] v) { /** * Detect if the column group contains a specific value. - * + * * @param pattern The value to look for. * @return boolean saying true if the value is contained. */ @@ -579,7 +583,7 @@ public AColGroup addVector(double[] v) { /** * Get the number of nonZeros contained in this column group. - * + * * @param nRows The number of rows in the column group, this is used for groups that does not contain information * about how many rows they have. * @return The nnz. @@ -588,7 +592,7 @@ public AColGroup addVector(double[] v) { /** * Make a copy of the column group values, and replace all values that match pattern with replacement value. - * + * * @param pattern The value to look for * @param replace The value to replace the other value with * @return A new Column Group, reusing the index structure but with new values. @@ -597,7 +601,7 @@ public AColGroup addVector(double[] v) { /** * Compute the column sum - * + * * @param c The array to add the column sum to. * @param nRows The number of rows in the column group. */ @@ -605,7 +609,7 @@ public AColGroup addVector(double[] v) { /** * Central Moment instruction executed on a column group. - * + * * @param op The Operator to use. * @param nRows The number of rows contained in the ColumnGroup. * @return A Central Moment object. 
@@ -614,7 +618,7 @@ public AColGroup addVector(double[] v) { /** * Expand the column group to multiple columns. (one hot encode the column group) - * + * * @param max The number of columns to expand to and cutoff values at. * @param ignore If zero and negative values should be ignored. * @param cast If the double values contained should be cast to whole numbers. @@ -625,7 +629,7 @@ public AColGroup addVector(double[] v) { /** * Get the computation cost associated with this column group. - * + * * @param e The computation cost estimator * @param nRows the number of rows in the column group * @return The cost of this column group @@ -634,7 +638,7 @@ public AColGroup addVector(double[] v) { /** * Perform unary operation on the column group and return a new column group - * + * * @param op The operation to perform * @return The new column group */ @@ -642,19 +646,19 @@ public AColGroup addVector(double[] v) { /** * Get if the group is only containing zero - * + * * @return true if empty */ public abstract boolean isEmpty(); /** - * Append the other column group to this column group. This method tries to combine them to return a new column group - * containing both. In some cases it is possible in reasonable time, in others it is not. - * + * Append the other column group to this column group. This method tries to combine them to return a new column + * group containing both. In some cases it is possible in reasonable time, in others it is not. + * * The result is first this column group followed by the other column group in higher row values. - * + * * If it is not possible or very inefficient null is returned. - * + * * @param g The other column group * @return A combined column group or null */ @@ -662,9 +666,9 @@ public AColGroup addVector(double[] v) { /** * Append all column groups in the list provided together in one go allocating the output once. - * + * * If it is not possible or very inefficient null is returned. - * + * * @param groups The groups to combine. 
* @param blen The normal number of rows in the groups * @param rlen The total number of rows of all combined. @@ -676,11 +680,11 @@ public static AColGroup appendN(AColGroup[] groups, int blen, int rlen) { /** * Append all column groups in the list provided together with this. - * + * * A Important detail is the first entry in the group == this, and should not be appended twice. - * + * * If it is not possible or very inefficient null is returned. - * + * * @param groups The groups to combine. * @param blen The normal number of rows in the groups * @param rlen The total number of rows of all combined. @@ -690,7 +694,7 @@ public static AColGroup appendN(AColGroup[] groups, int blen, int rlen) { /** * Get the compression scheme for this column group to enable compression of other data. - * + * * @return The compression scheme of this column group */ public abstract ICLAScheme getCompressionScheme(); @@ -704,14 +708,14 @@ public void clear() { /** * Recompress this column group into a new column group. - * + * * @return A new or the same column group depending on optimization goal. */ public abstract AColGroup recompress(); /** * Recompress this column group into a new column group of the given type. - * + * * @param ct The compressionType that the column group should morph into * @param nRow The number of rows in this columngroup. * @return A new column group @@ -741,7 +745,7 @@ else if(ct == CompressionType.UNCOMPRESSED) { /** * Get the compression info for this column group. - * + * * @param nRow The number of rows in this column group. * @return The compression info for this group. */ @@ -749,7 +753,7 @@ else if(ct == CompressionType.UNCOMPRESSED) { /** * Combine this column group with another - * + * * @param other The other column group to combine with. * @param nRow The number of rows in both column groups. * @return A combined representation as a column group. 
@@ -760,7 +764,7 @@ public AColGroup combine(AColGroup other, int nRow) { /** * Get encoding of this column group. - * + * * @return The encoding of the index structure. */ public IEncode getEncoding() { @@ -781,19 +785,19 @@ public AColGroup sortColumnIndexes() { /** * Perform row sum on the internal dictionaries, and return the same index structure. - * + * * This method returns null on empty column groups. - * + * * Note this method does not guarantee correct behavior if the given group is AMorphingGroup, instead it should be * morphed to a valid columngroup via extractCommon first. - * + * * @return The reduced colgroup. */ public abstract AColGroup reduceCols(); /** * Selection (left matrix multiply) - * + * * @param selection A sparse matrix with "max" a single one in each row all other values are zero. * @param points The coordinates in the selection matrix to extract. * @param ret The MatrixBlock to decompress the selected rows into @@ -806,17 +810,17 @@ public final void selectionMultiply(MatrixBlock selection, P[] points, MatrixBlo else denseSelection(selection, points, ret, rl, ru); } - + /** * Get an approximate sparsity of this column group - * + * * @return the approximate sparsity of this columngroup */ public abstract double getSparsity(); /** * Sparse selection (left matrix multiply) - * + * * @param selection A sparse matrix with "max" a single one in each row all other values are zero. * @param points The coordinates in the selection matrix to extract. * @param ret The Sparse MatrixBlock to decompress the selected rows into @@ -827,7 +831,7 @@ public final void selectionMultiply(MatrixBlock selection, P[] points, MatrixBlo /** * Dense selection (left matrix multiply) - * + * * @param selection A sparse matrix with "max" a single one in each row all other values are zero. * @param points The coordinates in the selection matrix to extract. 
* @param ret The Dense MatrixBlock to decompress the selected rows into @@ -839,7 +843,7 @@ public final void selectionMultiply(MatrixBlock selection, P[] points, MatrixBlo /** * Method to determine if the columnGroup have the same index structure as another. Note that the column indexes and * dictionaries are allowed to be different. - * + * * @param that the other column group * @return if the index is the same. */ @@ -850,7 +854,7 @@ public boolean sameIndexStructure(AColGroup that) { /** * C bind the list of column groups with this column group. the list of elements provided in the index of each list * is guaranteed to have the same index structures - * + * * @param nRow The number of rows contained in all right and this column group. * @param nCol The number of columns to shift the right hand side column groups over when combining, this should * only effect the column indexes @@ -888,7 +892,7 @@ public AColGroup combineWithSameIndex(int nRow, int nCol, List right) /** * C bind the given column group to this. - * + * * @param nRow The number of rows contained in the right and this column group. * @param nCol The number of columns in this. * @param right The column group to c-bind. @@ -928,16 +932,16 @@ protected IColIndex combineColIndexes(final int nCol, List right) { /** * This method returns a list of column groups that are naive splits of this column group as if it is reshaped. - * + * * This means the column groups rows are split into x number of other column groups where x is the multiplier. - * + * * The indexes are assigned round robbin to each of the output groups, meaning the first index is assigned. - * + * * If for instance the 4. column group is split by a 2 multiplier and there was 5 columns in total originally. The * output becomes 2 column groups at column index 4 and one at 9. - * + * * If possible the split column groups should reuse pointers back to the original dictionaries! 
- * + * * @param multiplier The number of column groups to split into * @param nRow The number of rows in this column group in case the underlying column group does not know * @param nColOrg The number of overall columns in the host CompressedMatrixBlock. @@ -947,25 +951,25 @@ protected IColIndex combineColIndexes(final int nCol, List right) { /** * This method returns a list of column groups that are naive splits of this column group as if it is reshaped. - * + * * This means the column groups rows are split into x number of other column groups where x is the multiplier. - * + * * The indexes are assigned round robbin to each of the output groups, meaning the first index is assigned. - * + * * If for instance the 4. column group is split by a 2 multiplier and there was 5 columns in total originally. The * output becomes 2 column groups at column index 4 and one at 9. - * + * * If possible the split column groups should reuse pointers back to the original dictionaries! - * + * * This specific variation is pushing down the parallelization given via the executor service provided. 
If not * overwritten the default is to call the normal split reshape - * + * * @param multiplier The number of column groups to split into * @param nRow The number of rows in this column group in case the underlying column group does not know * @param nColOrg The number of overall columns in the host CompressedMatrixBlock * @param pool The executor service to submit parallel tasks to - * @throws Exception In case there is an error we throw the exception out instead of handling it * @return a list of split column groups + * @throws Exception In case there is an error we throw the exception out instead of handling it */ public AColGroup[] splitReshapePushDown(final int multiplier, final int nRow, final int nColOrg, final ExecutorService pool) throws Exception { From bdb1550a8d23b226898598b82c2a2e84b40e6c92 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Tue, 3 Mar 2026 10:49:31 +0100 Subject: [PATCH 24/35] add: second compression Method --- .../compress/colgroup/ColGroupFactory.java | 66 +++++++++++-------- 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java index b51111a4aba..833ab7196e1 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java @@ -308,6 +308,9 @@ else if(ct == CompressionType.LinearFunctional) { else if(ct == CompressionType.PiecewiseLinear) { return compressPiecewiseLinearFunctional(colIndexes, in, cs); } + else if(ct == CompressionType.PiecewiseLinearSukzessive) { + return compressPiecewiseLinearFunctionalSukzessive(colIndexes, in, cs); + } else if(ct == CompressionType.DDCFOR) { AColGroup g = directCompressDDC(colIndexes, cg); if(g instanceof ColGroupDDC) @@ -1072,42 +1075,51 @@ private static AColGroup 
compressLinearFunctional(IColIndex colIndexes, MatrixBl return ColGroupLinearFunctional.create(colIndexes, coefficients, numRows); } - public static AColGroup compressPiecewiseLinearFunctional( - IColIndex colIndexes, MatrixBlock in, CompressionSettings cs) { + public static AColGroup compressPiecewiseLinearFunctional(IColIndex colIndexes, MatrixBlock in, + CompressionSettings cs) { final int numRows = in.getNumRows(); - AColGroup result = null; + final int numCols = colIndexes.size(); + int[][] breakpointsPerCol = new int[numCols][]; + double[][] slopesPerCol = new double[numCols][]; + double[][] interceptsPerCol = new double[numCols][]; + + for(int col = 0; col < numCols; col++) { + final int colIdx = colIndexes.get(col); + double[] column = PiecewiseLinearUtils.getColumn(in, colIdx); + PiecewiseLinearUtils.SegmentedRegression fit = PiecewiseLinearUtils.compressSegmentedLeastSquares(column, + cs); + breakpointsPerCol[col] = fit.getBreakpoints(); + interceptsPerCol[col] = fit.getIntercepts(); + slopesPerCol[col] = fit.getSlopes(); + + } + return ColGroupPiecewiseLinearCompressed.create(colIndexes, breakpointsPerCol, slopesPerCol, interceptsPerCol, + numRows); + + } - //Compress every column - for (int col = 0; col < colIndexes.size(); col++) { - // get Column Index - IColIndex.SliceResult sliceResult = colIndexes.slice(col, col + 1); - IColIndex singleColIndex = sliceResult.ret; // ← .ret nötig! 
+ public static AColGroup compressPiecewiseLinearFunctionalSukzessive(IColIndex colIndexes, MatrixBlock in, + CompressionSettings cs) { + final int numRows = in.getNumRows(); + final int numCols = colIndexes.size(); + int[][] breakpointsPerCol = new int[numCols][]; + double[][] slopesPerCol = new double[numCols][]; + double[][] interceptsPerCol = new double[numCols][]; - // Get Column from Matrix + for(int col = 0; col < numCols; col++) { final int colIdx = colIndexes.get(col); double[] column = PiecewiseLinearUtils.getColumn(in, colIdx); + PiecewiseLinearUtils.SegmentedRegression fit = PiecewiseLinearUtils.compressIterativePiecewiseLinear(column, + cs); + breakpointsPerCol[col] = fit.getBreakpoints(); + interceptsPerCol[col] = fit.getIntercepts(); + slopesPerCol[col] = fit.getSlopes(); - //Compress column - PiecewiseLinearUtils.SegmentedRegression fit = - PiecewiseLinearUtils.compressSegmentedLeastSquares(column, cs); - - AColGroup singleGroup = ColGroupPiecewiseLinearCompressed.create( - singleColIndex, - fit.getBreakpoints(), - fit.getSlopes(), - fit.getIntercepts(), - numRows); - - // Combine multiple columns - if (result == null) { - result = singleGroup; - } else { - result = result.combineWithSameIndex(numRows, col, singleGroup); - } } + return ColGroupPiecewiseLinearCompressed.create(colIndexes, breakpointsPerCol, slopesPerCol, interceptsPerCol, + numRows); - return result; } private AColGroup compressSDCFromSparseTransposedBlock(IColIndex cols, int nrUniqueEstimate, double tupleSparsity) { From fd8eaf1d3d46883b83726e7749f4c5e31bc06edc Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Tue, 3 Mar 2026 10:53:10 +0100 Subject: [PATCH 25/35] fix: rename second compression Method --- .../apache/sysds/runtime/compress/colgroup/ColGroupFactory.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java index 833ab7196e1..dfec14d2704 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java @@ -1110,7 +1110,7 @@ public static AColGroup compressPiecewiseLinearFunctionalSukzessive(IColIndex co for(int col = 0; col < numCols; col++) { final int colIdx = colIndexes.get(col); double[] column = PiecewiseLinearUtils.getColumn(in, colIdx); - PiecewiseLinearUtils.SegmentedRegression fit = PiecewiseLinearUtils.compressIterativePiecewiseLinear(column, + PiecewiseLinearUtils.SegmentedRegression fit = PiecewiseLinearUtils.compressSukzessivePiecewiseLinear(column, cs); breakpointsPerCol[col] = fit.getBreakpoints(); interceptsPerCol[col] = fit.getIntercepts(); From a19ef6a9e14c126fc7d0d992aac30f65f2233702 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Tue, 3 Mar 2026 11:14:54 +0100 Subject: [PATCH 26/35] add: second compression method --- .../functional/PiecewiseLinearUtils.java | 123 ++++++++---------- 1 file changed, 57 insertions(+), 66 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/functional/PiecewiseLinearUtils.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/functional/PiecewiseLinearUtils.java index 7005be9de65..5b67cba2173 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/functional/PiecewiseLinearUtils.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/functional/PiecewiseLinearUtils.java @@ -38,6 +38,16 @@ public double[] getIntercepts() { } } + public static double[] getColumn(MatrixBlock in, int colIndex) { + final int numRows = in.getNumRows(); + final double[] column = new double[numRows]; + + for(int row = 0; row < numRows; row++) { + column[row] = in.get(row, colIndex); + } + return column; + } + public static SegmentedRegression 
compressSegmentedLeastSquares(double[] column, CompressionSettings cs) { //compute Breakpoints for a Column with dynamic Programming final List breakpointsList = computeBreakpoints(cs, column); @@ -49,7 +59,7 @@ public static SegmentedRegression compressSegmentedLeastSquares(double[] column, final double[] intercepts = new double[numSeg]; // Regress per Segment - for (int seg = 0; seg < numSeg; seg++) { + for(int seg = 0; seg < numSeg; seg++) { final int SegStart = breakpoints[seg]; final int SegEnd = breakpoints[seg + 1]; @@ -61,10 +71,10 @@ public static SegmentedRegression compressSegmentedLeastSquares(double[] column, return new SegmentedRegression(breakpoints, slopes, intercepts); } - public static SegmentedRegression compressSegmentedLeastSquaresV2(double[] column, CompressionSettings cs) { - //compute Breakpoints for a Column with Greedy Algorithm + public static SegmentedRegression compressSukzessivePiecewiseLinear(double[] column, CompressionSettings cs) { + //compute Breakpoints for a Column with a sukzessive breakpoints algorithm - final List breakpointsList = computeBreakpointsGreedy(column, cs); + final List breakpointsList = computeBreakpointSukzessive(column, cs); final int[] breakpoints = breakpointsList.stream().mapToInt(Integer::intValue).toArray(); //get values for Regression @@ -73,77 +83,53 @@ public static SegmentedRegression compressSegmentedLeastSquaresV2(double[] colu final double[] intercepts = new double[numSeg]; // Regress per Segment - for (int seg = 0; seg < numSeg; seg++) { + for(int seg = 0; seg < numSeg; seg++) { final int segstart = breakpoints[seg]; final int segEnd = breakpoints[seg + 1]; final double[] line = regressSegment(column, segstart, segEnd); slopes[seg] = line[0]; intercepts[seg] = line[1]; } - return new SegmentedRegression(breakpoints,slopes, intercepts); - } - - public static double[] getColumn(MatrixBlock in, int colIndex) { - final int numRows = in.getNumRows(); - final double[] column = new double[numRows]; - - 
for (int row = 0; row < numRows; row++) { - column[row] = in.get(row, colIndex); - } - return column; + return new SegmentedRegression(breakpoints, slopes, intercepts); } public static List computeBreakpoints(CompressionSettings cs, double[] column) { final int numElements = column.length; final double targetMSE = cs.getPiecewiseTargetLoss(); - - // TODO: Maybe remove Fallback if no targetloss is given - /*if (Double.isNaN(targetMSE) || targetMSE <= 0) { - final double segmentPenalty = 2.0 * Math.log(numElements); - return computeBreakpointsLambda(column, segmentPenalty); - }*/ - // max targetloss final double sseMax = numElements * targetMSE; - double minLoss = 0.0; - double maxLoss = numElements * 100.0; - List bestBreaks = null; - //compute breakpoints - while(maxLoss -minLoss > 1e-8) { - final double currentLoss = 0.5 * (minLoss + maxLoss); - final List breaks = computeBreakpointsLambda(column, currentLoss); - final double totalSSE = computeTotalSSE(column, breaks); - if (totalSSE <= sseMax) { + double lambda = 1000.0; // Regulationparam + List bestBreaks = Arrays.asList(0, numElements); + + for(int iter = 0; iter < 20; iter++) { // fixed Iterations + List breaks = computeBreakpointsLambda(column, lambda); + double totalSSE = computeTotalSSE(column, breaks); + + if(totalSSE <= sseMax) { bestBreaks = breaks; - minLoss = currentLoss; - } - else { - maxLoss = currentLoss; } + lambda *= 0.8; } - if (bestBreaks == null) - bestBreaks = computeBreakpointsLambda(column, minLoss); - return bestBreaks; } public static List computeBreakpointsLambda(double[] column, double lambda) { - final int numrows = column.length; - final double[] costs = new double[numrows + 1]; //min Cost - final int[] prevStart = new int[numrows + 1]; //previous Start + final int numRows = column.length; + final double[] costs = new double[numRows + 1]; //min Cost + final int[] prevStart = new int[numRows + 1]; //previous Start costs[0] = 0.0; // Find Cost - for (int rowEnd = 1; rowEnd <= numrows; 
rowEnd++) { + for(int rowEnd = 1; rowEnd <= numRows; rowEnd++) { costs[rowEnd] = Double.POSITIVE_INFINITY; //Test all possible Segment to find the lowest costs - for (int rowStart = 0; rowStart < rowEnd; rowStart++) { - //costs = current costs + segmentloss + penaltiy + for(int rowStart = 0; rowStart < rowEnd; rowStart++) { + //costs per Segment = current costs + segmentloss + penaltiy final double costCurrentSegment = computeSegmentCost(column, rowStart, rowEnd); final double totalCost = costs[rowStart] + costCurrentSegment + lambda; // Check if it is the better solution - if (totalCost < costs[rowEnd]) { + if(totalCost < costs[rowEnd]) { costs[rowEnd] = totalCost; prevStart[rowEnd] = rowStart; } @@ -151,8 +137,8 @@ public static List computeBreakpointsLambda(double[] column, double lam } //Check the optimal segmentlimits final List segmentLimits = new ArrayList<>(); - int breakpointIndex = numrows; - while (breakpointIndex > 0) { + int breakpointIndex = numRows; + while(breakpointIndex > 0) { segmentLimits.add(breakpointIndex); breakpointIndex = prevStart[breakpointIndex]; } @@ -163,7 +149,7 @@ public static List computeBreakpointsLambda(double[] column, double lam public static double computeSegmentCost(double[] column, int start, int end) { final int segSize = end - start; - if (segSize <= 1) + if(segSize <= 1) return 0.0; final double[] ab = regressSegment(column, start, end); //Regressionline @@ -171,7 +157,7 @@ public static double computeSegmentCost(double[] column, int start, int end) { final double intercept = ab[1]; double sumSquaredError = 0.0; - for (int i = start; i < end; i++) { + for(int i = start; i < end; i++) { final double rowIdx = i; final double actualValue = column[i]; final double predictedValue = slope * rowIdx + intercept; @@ -183,7 +169,7 @@ public static double computeSegmentCost(double[] column, int start, int end) { public static double computeTotalSSE(double[] column, List breaks) { double total = 0.0; - for (int s = 0; s < 
breaks.size() - 1; s++) { + for(int s = 0; s < breaks.size() - 1; s++) { final int start = breaks.get(s); final int end = breaks.get(s + 1); total += computeSegmentCost(column, start, end); @@ -193,11 +179,11 @@ public static double computeTotalSSE(double[] column, List breaks) { public static double[] regressSegment(double[] column, int start, int end) { final int numElements = end - start; - if (numElements <= 0) + if(numElements <= 0) return new double[] {0.0, 0.0}; double sumOfRowIndices = 0, sumOfColumnValues = 0, sumOfRowIndicesSquared = 0, productRowIndexTimesColumnValue = 0; - for (int i = start; i < end; i++) { + for(int i = start; i < end; i++) { final double x = i; final double y = column[i]; sumOfRowIndices += x; @@ -207,37 +193,42 @@ public static double[] regressSegment(double[] column, int start, int end) { } final double numPointsInSegmentDouble = numElements; - final double denominatorForSlope = numPointsInSegmentDouble * sumOfRowIndicesSquared - sumOfRowIndices * sumOfRowIndices; + final double denominatorForSlope = + numPointsInSegmentDouble * sumOfRowIndicesSquared - sumOfRowIndices * sumOfRowIndices; final double slope; final double intercept; - if (denominatorForSlope == 0) { + if(denominatorForSlope == 0) { slope = 0.0; intercept = sumOfColumnValues / numPointsInSegmentDouble; } else { - slope = (numPointsInSegmentDouble * productRowIndexTimesColumnValue - sumOfRowIndices * sumOfColumnValues) / denominatorForSlope; + slope = (numPointsInSegmentDouble * productRowIndexTimesColumnValue - sumOfRowIndices * sumOfColumnValues) / + denominatorForSlope; intercept = (sumOfColumnValues - slope * sumOfRowIndices) / numPointsInSegmentDouble; } return new double[] {slope, intercept}; } - public static List computeBreakpointsGreedy(double[] column, CompressionSettings cs) { + + public static List computeBreakpointSukzessive(double[] column, CompressionSettings cs) { final int numElements = column.length; final double targetMSE = 
cs.getPiecewiseTargetLoss(); - if (Double.isNaN(targetMSE) || targetMSE <= 0) { - return Arrays.asList(0, numElements); // Fallback: ein Segment + if(Double.isNaN(targetMSE) || targetMSE <= 0) { + return Arrays.asList(0, numElements); // Fallback one Segment if targetloss is not valid } List breakpoints = new ArrayList<>(); - breakpoints.add(0); + breakpoints.add(0); // first segment start is always 0 int currentStart = 0; - while (currentStart < numElements) { - int bestEnd = numElements; // Default: Rest als Segment - for (int end = currentStart + 1; end <= numElements; end++) { + while(currentStart < numElements) { + int bestEnd = numElements; + //Check all possible Ends for this one segment + for(int end = currentStart + 1; end <= numElements; end++) { double sse = computeSegmentCost(column, currentStart, end); + // Check if the loss for this segment is smaller/egual to the targetloss double sseMax = (end - currentStart) * targetMSE; - if (sse > sseMax) { - bestEnd = end - 1; // Letzter gültiger Endpunkt + if(sse > sseMax) { + bestEnd = end - 1; break; } } @@ -245,7 +236,7 @@ public static List computeBreakpointsGreedy(double[] column, Compressio currentStart = bestEnd; } - if (breakpoints.get(breakpoints.size() - 1) != numElements) { + if(breakpoints.get(breakpoints.size() - 1) != numElements) { breakpoints.add(numElements); } return breakpoints; From 6006168a57380111e82ce09a063a81c18f8e42f7 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Tue, 3 Mar 2026 11:51:38 +0100 Subject: [PATCH 27/35] add: computing methods and operations --- .../ColGroupPiecewiseLinearCompressed.java | 409 +++++++++++++----- 1 file changed, 291 insertions(+), 118 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressed.java index 35891eb8c53..d6ad0c6c421 100644 --- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressed.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressed.java @@ -8,151 +8,370 @@ import org.apache.sysds.runtime.data.DenseBlock; import org.apache.sysds.runtime.data.SparseBlock; import org.apache.sysds.runtime.data.SparseBlockMCSR; -import org.apache.sysds.runtime.functionobjects.Builtin; +import org.apache.sysds.runtime.functionobjects.*; import org.apache.sysds.runtime.instructions.cp.CmCovObject; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.matrix.operators.BinaryOperator; import org.apache.sysds.runtime.matrix.operators.CMOperator; import org.apache.sysds.runtime.matrix.operators.ScalarOperator; import org.apache.sysds.runtime.matrix.operators.UnaryOperator; +import org.apache.sysds.utils.MemoryEstimates; import java.util.Arrays; public class ColGroupPiecewiseLinearCompressed extends AColGroupCompressed { - IColIndex colIndexes; - int[] breakpoints; - double[] slopes; - double[] intercepts; + int[][] breakpointsPerCol; + double[][] slopesPerCol; + double[][] interceptsPerCol; int numRows; protected ColGroupPiecewiseLinearCompressed(IColIndex colIndices) { super(colIndices); } - public ColGroupPiecewiseLinearCompressed(IColIndex colIndexes, int[] breakpoints, double[] slopes, - double[] intercepts, int numRows) { - super(colIndexes); - this.colIndexes = colIndexes; - this.breakpoints = breakpoints; - this.slopes = slopes; - this.intercepts = intercepts; + public ColGroupPiecewiseLinearCompressed(IColIndex colIndices, int[][] breakpoints, double[][] slopes, + double[][] intercepts, int numRows) { + super(colIndices); + this.breakpointsPerCol = breakpoints; + this.slopesPerCol = slopes; + this.interceptsPerCol = intercepts; this.numRows = numRows; } - public static AColGroup create(IColIndex colIndexes, int[] breakpoints, double[] slopes, double[] intercepts, - int numRows) { - 
if(breakpoints == null || breakpoints.length < 2) + public static AColGroup create(IColIndex colIndices, int[][] breakpointsPerCol, double[][] slopesPerCol, + double[][] interceptsPerCol, int numRows) { + int expectedCols = colIndices.size(); + if(breakpointsPerCol.length != expectedCols) + throw new IllegalArgumentException( + "bp.length=" + breakpointsPerCol.length + " != colIndices.size()=" + expectedCols); + if(breakpointsPerCol.length != colIndices.size()) throw new IllegalArgumentException("Need at least one segment"); - int numSeg = breakpoints.length - 1; - if(slopes.length != numSeg || intercepts.length != numSeg) - throw new IllegalArgumentException("Inconsistent segment arrays"); + for(int c = 0; c < colIndices.size(); c++) { + if(breakpointsPerCol[c].length < 1 || breakpointsPerCol[c][0] != 0 || + breakpointsPerCol[c][breakpointsPerCol[c].length - 1] != numRows) + throw new IllegalArgumentException( + "Invalid breakpoints for col " + c + ": must start=0, end=numRows, >=1 pts"); + + if(slopesPerCol[c].length != interceptsPerCol[c].length || + slopesPerCol[c].length != breakpointsPerCol[c].length - 1) + throw new IllegalArgumentException("Inconsistent array lengths col " + c); + } + + int numCols = colIndices.size(); + int[][] bpCopy = new int[numCols][]; + double[][] slopeCopy = new double[numCols][]; + double[][] interceptCopy = new double[numCols][]; - int[] bpCopy = Arrays.copyOf(breakpoints, breakpoints.length); - double[] slopeCopy = Arrays.copyOf(slopes, slopes.length); - double[] interceptCopy = Arrays.copyOf(intercepts, intercepts.length); + for(int c = 0; c < numCols; c++) { + bpCopy[c] = Arrays.copyOf(breakpointsPerCol[c], breakpointsPerCol[c].length); + slopeCopy[c] = Arrays.copyOf(slopesPerCol[c], slopesPerCol[c].length); + interceptCopy[c] = Arrays.copyOf(interceptsPerCol[c], interceptsPerCol[c].length); + } - return new ColGroupPiecewiseLinearCompressed(colIndexes, bpCopy, slopeCopy, interceptCopy, numRows); + return new 
ColGroupPiecewiseLinearCompressed(colIndices, bpCopy, slopeCopy, interceptCopy, numRows); } @Override public void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int offC) { - - //Safety-Check: - if(db == null || colIndexes == null || colIndexes.size() == 0 || breakpoints == null || slopes == null || - intercepts == null) { + if(db == null || _colIndexes == null || _colIndexes.size() == 0 || breakpointsPerCol == null || + slopesPerCol == null || interceptsPerCol == null) { return; } - //Validate Segments - int sizeSegment = breakpoints.length - 1; - if(sizeSegment <= 0 || rl >= ru) { - return; - } - //Find every Segment - final int column = _colIndexes.get(0); - for(int currentSeg = 0; currentSeg < sizeSegment; currentSeg++) { - int segStart = breakpoints[currentSeg]; - int segEnd = breakpoints[currentSeg + 1]; - if(segStart >= segEnd) - continue; - - double currentSlope = slopes[currentSeg]; - double currentIntercepts = intercepts[currentSeg]; - - int rowStart = Math.max(segStart, rl); - int rowEnd = Math.min(segEnd, ru); - if(rowStart >= rowEnd) - continue; - - // Filling DenseBlock Matrix - for(int r = rowStart; r < rowEnd; r++) { - double yhat = currentSlope * r + currentIntercepts; - int dbRow = offR + r; - int dbColumn = offC + column; - - if(dbRow >= 0 && dbRow < db.numRows() && dbColumn >= 0 && dbColumn < db.numCols()) { - db.set(dbRow, dbColumn, yhat); + for(int col = 0; col < _colIndexes.size(); col++) { + final int colIndex = _colIndexes.get(col); + int[] breakpoints = breakpointsPerCol[col]; + double[] slopes = slopesPerCol[col]; + double[] intercepts = interceptsPerCol[col]; + // per segment in this column + for(int seg = 0; seg + 1 < breakpoints.length; seg++) { // ← +1 statt length + int segStart = breakpoints[seg]; + int segEnd = breakpoints[seg + 1]; + if(segStart >= segEnd) + continue; + + double currentSlopeInSegment = slopes[seg]; + double currentInterceptInSegment = intercepts[seg]; + + int rowStart = Math.max(segStart, rl); + 
int rowEnd = Math.min(segEnd, ru); + if(rowStart >= rowEnd) + continue; + + //Fill DenseBlock für this column and Segment + for(int row = rowStart; row < rowEnd; row++) { + double yhat = currentSlopeInSegment * row + currentInterceptInSegment; + int dbRow = offR + row; + int dbCol = offC + colIndex; + + if(dbRow >= 0 && dbRow < db.numRows() && dbCol >= 0 && dbCol < db.numCols()) { + db.set(dbRow, dbCol, yhat); + } } + } + } } - public int[] getBreakpoints() { - return breakpoints; + public int[][] getBreakpointsPerCol() { + return breakpointsPerCol; } - public double[] getSlopes() { - return slopes; + public double[][] getSlopesPerCol() { + return slopesPerCol; } - public double[] getIntercepts() { - return intercepts; + public double[][] getInterceptsPerCol() { + return interceptsPerCol; } @Override public double getIdx(int r, int colIdx) { //Check if the rowIDx is valid (safety check) - if(r < 0 || r >= numRows || colIdx < 0 || colIdx >= colIndexes.size()) { + if(r < 0 || r >= numRows || colIdx < 0 || colIdx >= _colIndexes.size()) { return 0.0; } + int[] bps = breakpointsPerCol[colIdx]; + double[] slps = slopesPerCol[colIdx]; + double[] ints = interceptsPerCol[colIdx]; // Using Binary Search for efficient Search for the right Segment ( finding rowIdx r) - // have to use int higherBound = breakpoints.length - 2 because it's the last valid segment + // have to use int higherBound = breakpointsPerCol.length - 2 because it's the last valid segment int lowerBound = 0; - int higherBound = breakpoints.length - 2; + int higherBound = bps.length - 2; while(lowerBound <= higherBound) { int mid = (lowerBound + higherBound) / 2; - if(r < breakpoints[mid] + 1) { + if(r < bps[mid + 1]) { higherBound = mid - 1; } else lowerBound = mid + 1; } - int segment = Math.min(lowerBound, breakpoints.length - 2); - - return slopes[segment] * (double) r + intercepts[segment]; + int segment = Math.min(lowerBound, bps.length - 2); + return slps[segment] * (double) r + ints[segment]; } 
@Override public int getNumValues() { - return breakpoints.length + slopes.length + intercepts.length; + return breakpointsPerCol.length + slopesPerCol.length + interceptsPerCol.length; } @Override - protected double computeMxx(double c, Builtin builtin) { + public long getExactSizeOnDisk() { + long ret = super.getExactSizeOnDisk(); + int numCols = _colIndexes.size(); + ret += 8L * numCols * 3; + ret += 24L * 3; + + for(int c = 0; c < numCols; c++) { + ret += (long) MemoryEstimates.intArrayCost(breakpointsPerCol[c].length); + ret += (long) MemoryEstimates.doubleArrayCost(slopesPerCol[c].length); + ret += (long) MemoryEstimates.doubleArrayCost(interceptsPerCol[c].length); + } + + ret += 4L; + return ret; + + } + + @Override + public void computeSum(double[] c, int nRows) { + for(int col = 0; col < _colIndexes.size(); col++) { + double colSum = 0.0; + int[] breakpoints = breakpointsPerCol[col]; + double[] intercepts = interceptsPerCol[col]; + double[] slopes = slopesPerCol[col]; + for(int seg = 0; seg < breakpoints.length - 1; seg++) { + int start = breakpoints[seg], end = breakpoints[seg + 1]; + int len = end - start; + double b = intercepts[seg], m = slopes[seg]; + double sumR = (double) len * (len - 1) / 2.0; + colSum += (double) len * b + m * sumR; + } + c[col] += colSum; + } + } + + @Override + public void computeColSums(double[] c, int nRows) { + computeSum(c, nRows); + } + + @Override + public CompressionType getCompType() { + return CompressionType.PiecewiseLinear; + } + + @Override + protected ColGroupType getColGroupType() { + return ColGroupType.PiecewiseLinear; + } + + @Override + public AColGroup scalarOperation(ScalarOperator op) { + final int numCols = _colIndexes.size(); + double[][] newIntercepts = new double[numCols][]; + double[][] newSlopes = new double[numCols][]; + if(op.fn instanceof Plus || op.fn instanceof Minus) { + for(int col = 0; col < numCols; col++) { + int numSegments = interceptsPerCol[col].length; + newIntercepts[col] = new 
double[numSegments]; + newSlopes[col] = slopesPerCol[col].clone(); // Unverändert + for(int seg = 0; seg < numSegments; seg++) + newIntercepts[col][seg] = op.executeScalar(interceptsPerCol[col][seg]); + } // shift intercept + } + else if(op.fn instanceof Multiply || op.fn instanceof Divide) { + for(int col = 0; col < numCols; col++) { + int numSegments = interceptsPerCol[col].length; + newIntercepts[col] = new double[numSegments]; + newSlopes[col] = new double[numSegments]; + for(int seg = 0; seg < numSegments; seg++) { + newIntercepts[col][seg] = op.executeScalar(interceptsPerCol[col][seg]); + newSlopes[col][seg] = op.executeScalar(slopesPerCol[col][seg]); + } + }//shift slope and intercept + } + else { + throw new NotImplementedException("Unsupported scalar op"); + } + // new ColGroup because of changed slopes, intercepts + return new ColGroupPiecewiseLinearCompressed(_colIndexes, breakpointsPerCol, newSlopes, newIntercepts, numRows); + } + + @Override + public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) { + final int numCols = _colIndexes.size(); + double[][] newIntercepts = new double[numCols][]; + double[][] newSlopes = new double[numCols][]; + if(op.fn instanceof Plus || op.fn instanceof Minus) { + for(int col = 0; col < numCols; col++) { + double rowValue = v[_colIndexes.get(col)]; + int numSeg = interceptsPerCol[col].length; + newIntercepts[col] = new double[numSeg]; + newSlopes[col] = slopesPerCol[col].clone(); + for(int seg = 0; seg < numSeg; seg++) { + newIntercepts[col][seg] = op.fn.execute(rowValue, interceptsPerCol[col][seg]); + } + } + } + else if(op.fn instanceof Multiply || op.fn instanceof Divide) { + for(int col = 0; col < numCols; col++) { + double rowValue = v[_colIndexes.get(col)]; + int numSeg = interceptsPerCol[col].length; + newIntercepts[col] = new double[numSeg]; + newSlopes[col] = new double[numSeg]; + for(int seg = 0; seg < numSeg; seg++) { + newIntercepts[col][seg] = op.fn.execute(rowValue, 
interceptsPerCol[col][seg]); + newSlopes[col][seg] = op.fn.execute(rowValue, slopesPerCol[col][seg]); + } + } + } + else { + throw new NotImplementedException("Unsupported binary op"); + } + return new ColGroupPiecewiseLinearCompressed(_colIndexes, breakpointsPerCol, newSlopes, newIntercepts, numRows); + } + + @Override + public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) { + final int numCols = _colIndexes.size(); + double[][] newIntercepts = new double[numCols][]; + double[][] newSlopes = new double[numCols][]; + if(op.fn instanceof Plus || op.fn instanceof Minus) { + for(int col = 0; col < numCols; col++) { + double rowValue = v[_colIndexes.get(col)]; + int numSeg = interceptsPerCol[col].length; + newIntercepts[col] = new double[numSeg]; + newSlopes[col] = slopesPerCol[col].clone(); + for(int seg = 0; seg < numSeg; seg++) { + newIntercepts[col][seg] = op.fn.execute(interceptsPerCol[col][seg], rowValue); + } + } + } + else if(op.fn instanceof Multiply || op.fn instanceof Divide) { + for(int col = 0; col < numCols; col++) { + double rowValue = v[_colIndexes.get(col)]; + int numSeg = interceptsPerCol[col].length; + newIntercepts[col] = new double[numSeg]; + newSlopes[col] = new double[numSeg]; + for(int seg = 0; seg < numSeg; seg++) { + newIntercepts[col][seg] = op.fn.execute(interceptsPerCol[col][seg], rowValue); + newSlopes[col][seg] = op.fn.execute(slopesPerCol[col][seg], rowValue); + } + } + } + else { + throw new NotImplementedException("Unsupported binary op"); + } + return new ColGroupPiecewiseLinearCompressed(_colIndexes, breakpointsPerCol, newSlopes, newIntercepts, numRows); + } + + @Override + public boolean containsValue(double pattern) { + for(int col = 0; col < _colIndexes.size(); col++) { + if(colContainsValue(col, pattern)) + return true; + } + return false; + } + + private boolean colContainsValue(int col, double pattern) { + int[] breakpoints = breakpointsPerCol[col]; + double[] intercepts = interceptsPerCol[col]; + 
double[] slopes = slopesPerCol[col]; + int numSeg = breakpoints.length - 1; + + for(int seg = 0; seg < numSeg; seg++) { + int start = breakpoints[seg]; + int end = breakpoints[seg + 1]; + int len = end - start; + if(len <= 0) + continue; + + double yIntercept = intercepts[seg]; + double slope = slopes[seg]; + + if(slope == 0.0) { + if(Double.compare(yIntercept, pattern) == 0) + return true; + continue; + } + + if(Double.compare(yIntercept, pattern) == 0) + return true; + + double endVal = yIntercept + slope * (len - 1); + if(Double.compare(endVal, pattern) == 0) + return true; + + double rowIndex = (pattern - yIntercept) / slope; + if(rowIndex > 0 && rowIndex < (len - 1) && Double.compare(yIntercept + slope * rowIndex, pattern) == 0) + return true; + } + return false; + } + + @Override + public AColGroup unaryOperation(UnaryOperator op) { throw new NotImplementedException(); } @Override - protected void computeColMxx(double[] c, Builtin builtin) { + public AColGroup replace(double pattern, double replace) { throw new NotImplementedException(); } @Override - protected void computeSum(double[] c, int nRows) { + protected double computeMxx(double c, Builtin builtin) { throw new NotImplementedException(); + } + @Override + protected void computeColMxx(double[] c, Builtin builtin) { + throw new NotImplementedException(); } @Override @@ -245,16 +464,6 @@ public void decompressToSparseBlockTransposed(SparseBlockMCSR sb, int nColOut) { } - @Override - public CompressionType getCompType() { - throw new NotImplementedException(); - } - - @Override - protected ColGroupType getColGroupType() { - throw new NotImplementedException(); - } - @Override public void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC) { throw new NotImplementedException(); @@ -284,21 +493,6 @@ public void tsmmAColGroup(AColGroup other, MatrixBlock result) { } - @Override - public AColGroup scalarOperation(ScalarOperator op) { - throw new NotImplementedException(); - } - - 
@Override - public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) { - throw new NotImplementedException(); - } - - @Override - public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) { - throw new NotImplementedException(); - } - @Override protected AColGroup sliceSingleColumn(int idx) { throw new NotImplementedException(); @@ -314,27 +508,11 @@ public AColGroup sliceRows(int rl, int ru) { throw new NotImplementedException(); } - @Override - public boolean containsValue(double pattern) { - throw new NotImplementedException(); - } - @Override public long getNumberNonZeros(int nRows) { throw new NotImplementedException(); } - @Override - public AColGroup replace(double pattern, double replace) { - throw new NotImplementedException(); - } - - @Override - public void computeColSums(double[] c, int nRows) { - throw new NotImplementedException(); - - } - @Override public CmCovObject centralMoment(CMOperator op, int nRows) { throw new NotImplementedException(); @@ -350,11 +528,6 @@ public double getCost(ComputationCostEstimator e, int nRows) { throw new NotImplementedException(); } - @Override - public AColGroup unaryOperation(UnaryOperator op) { - throw new NotImplementedException(); - } - @Override public AColGroup append(AColGroup g) { throw new NotImplementedException(); From 8b189100aae5e3a6a71cb2e22997b2714bb0c784 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Tue, 3 Mar 2026 12:10:23 +0100 Subject: [PATCH 28/35] wip: removed Test, used more on random generated matrices, add new ones --- ...ColGroupPiecewiseLinearCompressedTest.java | 1218 ++++++++++------- 1 file changed, 712 insertions(+), 506 deletions(-) diff --git a/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java b/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java index fa1f88fab98..1672da79704 100644 
--- a/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java +++ b/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java @@ -12,16 +12,20 @@ import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup; import org.apache.sysds.runtime.compress.estim.EstimationFactors; import org.apache.sysds.runtime.data.DenseBlock; +import org.apache.sysds.runtime.functionobjects.Multiply; +import org.apache.sysds.runtime.functionobjects.Plus; import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.matrix.operators.BinaryOperator; +import org.apache.sysds.runtime.matrix.operators.RightScalarOperator; import org.apache.sysds.runtime.util.DataConverter; import org.apache.sysds.test.AutomatedTestBase; import org.junit.Test; import java.util.Arrays; import java.util.List; +import java.util.Random; import static org.apache.sysds.runtime.compress.colgroup.functional.PiecewiseLinearUtils.*; -import static org.apache.sysds.test.functions.io.binary.BlocksizeTest.sparsity; import static org.junit.Assert.*; public class ColGroupPiecewiseLinearCompressedTest extends AutomatedTestBase { @@ -30,724 +34,926 @@ public void setUp() { } - @Test - public void testComputeBreakpointsUniformColumn() { - CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(1e-3); - double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; // ← Test-spezifisch - List breaks = computeBreakpoints(cs, column); - assertEquals(Arrays.asList(0, 5), breaks); // Erwartet: keine Breaks - } + private static final long SEED = 42L; @Test - public void testComputeBreakpointsLinearIncreasing() { - CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(1e-3); - double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; // ← andere column - List breaks = computeBreakpoints(cs, column); - assertEquals(Arrays.asList(0, 5), breaks); 
// Erwartet - - } + public void testCompressPiecewiseLinearFunctionalRandom() { + // Generate random data + final int nrows = 50, ncols = 3; + double[][] data = getRandomMatrix(nrows, ncols, -3, 3, 1.0, SEED); + MatrixBlock in = DataConverter.convertToMatrixBlock(data); + in.allocateDenseBlock(); + // extract columns + double[][] columns = new double[ncols][nrows]; + for(int c = 0; c < ncols; c++) + for(int r = 0; r < nrows; r++) + columns[c][r] = data[r][c]; + // create ColIndexes + int[] colArray = {0, 1, 2}; + IColIndex colIndexes = ColIndexFactory.create(colArray); - @Test - public void testComputeBreakpointsTwoSegments() { + // set targetloss CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(1e-3); - // {1,1,1, 2,2,2} → 2 Segmente → [0,3,6] - double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0}; - var breaks = computeBreakpoints(cs, column); - assertEquals(Arrays.asList(0, 3, 6), breaks); - } - + cs.setPiecewiseTargetLoss(25.0); + // compress + AColGroup result = ColGroupFactory.compressPiecewiseLinearFunctional(colIndexes, in, cs); + assertTrue(result instanceof ColGroupPiecewiseLinearCompressed); + ColGroupPiecewiseLinearCompressed plGroup = (ColGroupPiecewiseLinearCompressed) result; - @Test - public void testComputeBreakpointsLambdaConst() { - double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; - List breaks = computeBreakpointsLambda(column, 5.0); - assertEquals(Arrays.asList(0, 5), breaks); + // check structure + int[][] bp = plGroup.getBreakpointsPerCol(); + assertEquals(3, bp.length); // 3 Spalten + assertEquals(3, colIndexes.size()); - breaks = computeBreakpointsLambda(column, 0.01); - assertEquals(Arrays.asList(0, 5), breaks); - } + for(int c = 0; c < ncols; c++) { + assertEquals(0, bp[c][0]); // start with 0 + assertEquals(nrows, bp[c][bp[c].length - 1]); + assertTrue(bp[c].length >= 2); // Mind. 
1 Segment + } - @Test - public void testComputeBreakpointsLambdaTwoSegments() { - double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0}; // 6 Werte + double[][] slopes = plGroup.getSlopesPerCol(); + double[][] intercepts = plGroup.getInterceptsPerCol(); + assertEquals(3, slopes.length); + for(int c = 0; c < ncols; c++) { + assertEquals(bp[c].length - 1, slopes[c].length); + assertEquals(bp[c].length - 1, intercepts[c].length); + } - // mit kleinem lambda -> viele Segmente (kostenlos fast) - List breaks = computeBreakpointsLambda(column, 0.01); - assertTrue(breaks.contains(3)); - assertEquals(3, breaks.size()); - assertEquals(Arrays.asList(0, 3, 6), breaks); + // check col indexes shouldnt change + assertEquals(3, plGroup.getColIndices().size()); - // mit großem lambda entspricht nur ein Segment - breaks = computeBreakpointsLambda(column, 1000.0); - assertEquals(Arrays.asList(0, 6), breaks); + // decompress + MatrixBlock recon = new MatrixBlock(nrows, ncols, false); + recon.allocateDenseBlock(); + plGroup.decompressToDenseBlock(recon.getDenseBlock(), 0, nrows, 0, ncols - 1); + assertFalse(Double.isNaN(recon.get(0, 0))); } - @Test - public void testComputeBreakpointsLambdaJumpWithTrend() { - double[] column = {0.0, 1.0, 2.0, 10.0, 11.0, 12.0}; - - // grobe Segmentanpassung: ein Segment pro „Abschnitt“ - List breaks = computeBreakpointsLambda(column, 0.5); - assertEquals(Arrays.asList(0, 3, 6), breaks); + private void testCompressStructure(double[][] data) { + final int nrows = data.length, ncols = data[0].length; + MatrixBlock in = DataConverter.convertToMatrixBlock(data); + in.allocateDenseBlock(); - // nur ein Segment, wenn lambda sehr groß - breaks = computeBreakpointsLambda(column, 100.0); - assertEquals(Arrays.asList(0, 6), breaks); - } + int[] colArray = new int[ncols]; + for(int i = 0; i < ncols; i++) + colArray[i] = i; + IColIndex colIndexes = ColIndexFactory.create(colArray); + CompressionSettings cs = new CompressionSettingsBuilder().create(); + 
cs.setPiecewiseTargetLoss(100.0); - @Test - public void testComputeBreakpointsLambdaLinear() { - double[] column = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0}; + AColGroup result = ColGroupFactory.compressPiecewiseLinearFunctional(colIndexes, in, cs); + assertTrue(result instanceof ColGroupPiecewiseLinearCompressed); + ColGroupPiecewiseLinearCompressed plGroup = (ColGroupPiecewiseLinearCompressed) result; - List breaks = computeBreakpointsLambda(column, 1.0); - assertEquals(Arrays.asList(0, 6), breaks); + int[][] bp = plGroup.getBreakpointsPerCol(); + assertEquals(ncols, bp.length); + for(int c = 0; c < ncols; c++) { + assertEquals(0, bp[c][0]); + assertEquals(nrows, bp[c][bp[c].length - 1]); + } + double[][] slopes = plGroup.getSlopesPerCol(); + assertEquals(ncols, slopes.length); + for(int c = 0; c < ncols; c++) { + assertEquals(bp[c].length - 1, slopes[c].length); + } + assertEquals(ncols, plGroup.getColIndices().size()); - // mit sehr kleinem lambda: wir prüfen nur, dass die Grenzen vernünftig sind - breaks = computeBreakpointsLambda(column, 0.001); - assertTrue(breaks.size() >= 2); - assertTrue(breaks.get(0) == 0); - assertTrue(breaks.get(breaks.size() - 1) == column.length); + MatrixBlock recon = new MatrixBlock(nrows, ncols, false); + recon.allocateDenseBlock(); + plGroup.decompressToDenseBlock(recon.getDenseBlock(), 0, nrows, 0, ncols - 1); } @Test - public void testComputeBreakpointsLambdaEdgeLambdaVerySmall() { - double[] column = {1.0, 1.1, 1.0, 1.1, 1.0}; + public void testCompressTrendNoise() { + final int nrows = 100, ncols = 2; + Random rng = new Random(SEED); + double[][] data = new double[nrows][ncols]; - List breaks = computeBreakpointsLambda(column, 0.001); - assertNotNull(breaks); - assertFalse(breaks.isEmpty()); - assertEquals(0, (int) breaks.get(0)); - assertEquals(column.length, (int) breaks.get(breaks.size() - 1)); - - // Prüfe, dass die Liste sortiert ist - for(int i = 1; i < breaks.size(); i++) { - assertTrue(breaks.get(i) >= breaks.get(i - 1)); + 
for(int r = 0; r < nrows; r++) { + double trend = 0.05 * r; + for(int c = 0; c < ncols; c++) { + data[r][c] = trend + rng.nextGaussian() * 1.5 + c * 2.0; + } } + + testCompressStructure(data); } @Test - public void testComputeBreakpointsLambdaEdgeLambdaVeryLarge() { - double[] column = {1.0, 2.0, 1.5, 2.5, 1.8}; - - List breaks = computeBreakpointsLambda(column, 1000.0); - assertEquals(Arrays.asList(0, 5), breaks); + public void testCompressJumps() { + final int nrows = 80, ncols = 3; + double[][] data = getRandomMatrix(nrows, ncols, -2, 2, 1.0, SEED); + for(int c = 0; c < ncols; c++) { + for(int r = 25; r < 55; r++) + data[r][c] += 8.0; + for(int r = 55; r < nrows; r++) + data[r][c] += 15.0; + } + testCompressStructure(data); } @Test - public void testComputeSegmentCostEmptyOrSingle() { - double[] column = {10.0, 20.0, 30.0}; - - // 0 Elemente (leer) - assertEquals(0.0, computeSegmentCost(column, 0, 0), 1e-10); - assertEquals(0.0, computeSegmentCost(column, 1, 1), 1e-10); - - // 1 Element → Regressionsgerade ist nicht eindeutig definiert, aber SSE=0 - assertEquals(0.0, computeSegmentCost(column, 0, 1), 1e-10); - assertEquals(0.0, computeSegmentCost(column, 1, 2), 1e-10); - assertEquals(0.0, computeSegmentCost(column, 2, 3), 1e-10); + public void testCompressHighFreq() { + final int nrows = 100, ncols = 50; + Random rng = new Random(SEED); + double[][] data = new double[nrows][ncols]; + for(int r = 0; r < nrows; r++) { + double sine = Math.sin(r * 0.4) * 4.0; + for(int c = 0; c < ncols; c++) { + data[r][c] = sine + rng.nextGaussian() * 0.8 + Math.sin(r * 0.2 + c) * 2.0; + } + } + testCompressStructure(data); } @Test - public void testComputeSegmentCostTwoConstantPoints() { - double[] column = {5.0, 5.0, 1.0, 1.0}; - - // Zwei identische Punkte (konstant) → SSE = 0 - double sse = computeSegmentCost(column, 0, 2); - assertEquals(0.0, sse, 1e-10); + public void testCompressSingleLowVariance() { + final int nrows = 50, ncols = 1; + double[][] data = 
getRandomMatrix(nrows, ncols, -1, 1, 1.0, SEED); + testCompressStructure(data); } @Test - public void testComputeSegmentCostTwoDifferentPoints() { - double[] column = {0.0, 2.0, 1.0, 3.0}; + public void testCompressSingleColumnStructure() { + double[][] data = getRandomMatrix(50, 1, -1, 1, 1.0, SEED); + testCompressStructure(data); + } - // Zwei Punkte: (0,0) und (1,2) → Gerade y = 2*x, Fehler = 0 - double sse = computeSegmentCost(column, 0, 2); - assertEquals(0.0, sse, 1e-10); + @Test(expected = NullPointerException.class) // ← Dein realer Crash! + public void testCreateNullBreakpoints() { + IColIndex cols = ColIndexFactory.create(new int[] {0}); + int[][] nullBp = {null}; + ColGroupPiecewiseLinearCompressed.create(cols, nullBp, new double[][] {{1.0}}, new double[][] {{0.0}}, 10); + } - // Zwei Punkte: (2,1) und (3,3) → Gerade y = 2*x - 3, Fehler = 0 - sse = computeSegmentCost(column, 2, 4); - assertEquals(0.0, sse, 1e-10); + @Test(expected = IllegalArgumentException.class) + public void testCreateTooFewBreakpoints() { + int[][] singleBp = {new int[] {0}}; + ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), singleBp, + new double[][] {new double[] {1.0}}, new double[][] {new double[] {0.0}}, 10); } - @Test - public void testComputeSegmentCostConstantThree() { - double[] column = {0.0, 0.0, 0.0}; - double sse = computeSegmentCost(column, 0, 3); - assertEquals(0.0, sse, 1e-10); + @Test(expected = IllegalArgumentException.class) + public void testCreateInconsistentSlopes() { + int[] bp = {0, 5, 10}; + ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), new int[][] {bp}, + new double[][] {new double[] {1.0, 2.0, 3.0}}, new double[][] {new double[] {0.0, 1.0}}, 10); } - @Test - public void testComputeSegmentCostConsistentWithRegression() { - double[] column = {0.0, 2.0, 0.0, 4.0, 0.0, 6.0}; + @Test(expected = IllegalArgumentException.class) + public void testCreateInconsistentIntercepts() { + int[] bp = {0, 5, 
10}; + ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), new int[][] {bp}, + new double[][] {new double[] {1.0, 2.0}}, new double[][] {new double[] {0.0}}, 10); + } - int start = 0, end = 3; - double[] ab = regressSegment(column, start, end); - double slope = ab[0], intercept = ab[1]; - double sse_hand = 0.0; - for(int i = start; i < end; i++) { - double yhat = slope * i + intercept; - double diff = column[i] - yhat; - sse_hand += diff * diff; + private int findSegment(int[] bps, int r) { + for(int s = 0; s < bps.length - 1; s++) { + if(r < bps[s + 1]) + return s; } - - double sse = computeSegmentCost(column, start, end); - assertEquals(sse_hand, sse, 1e-10); + return bps.length - 2; } - @Test - public void testComputeTotalSSETwoSegments() { - // Beispiel: [0,0,0] und [1,1,1] (jeweils konstant) - double[] column = {0.0, 0.0, 0.0, 1.0, 1.0, 1.0}; - List breaks = Arrays.asList(0, 3, 6); // zwei Segmente + public void testCreateValidMultiSegmentRandom() { + Random rng = new Random(SEED); + final int nrows = 20; - double total = computeTotalSSE(column, breaks); - double sse1 = computeSegmentCost(column, 0, 3); // [0,0,0] → SSE = 0 - double sse2 = computeSegmentCost(column, 3, 6); // [1,1,1] → SSE = 0 + int[][] bp = {{0, rng.nextInt(5) + 3, rng.nextInt(10) + 8, nrows}, {0, rng.nextInt(8) + 2, nrows}}; + double[][] slopes = {{rng.nextDouble() * 3 - 1.5, rng.nextDouble() * 3 - 1.5, rng.nextDouble() * 3 - 1.5}, + {rng.nextDouble() * 3 - 1.5, rng.nextDouble() * 3 - 1.5}}; + double[][] intercepts = {{rng.nextDouble() * 2 - 1, rng.nextDouble() * 2 - 1, rng.nextDouble() * 2 - 1}, + {rng.nextDouble() * 2 - 1, rng.nextDouble() * 2 - 1}}; + + IColIndex cols = ColIndexFactory.create(new int[] {0, 1}); + AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, nrows); - // da beide Segmente konstant sind, muss totalSSE = 0 sein - assertEquals(0.0, total, 1e-10); - assertEquals(sse1 + sse2, total, 1e-10); + assertTrue(cg 
instanceof ColGroupPiecewiseLinearCompressed); + ColGroupPiecewiseLinearCompressed pl = (ColGroupPiecewiseLinearCompressed) cg; + assertNotSame(bp, pl.getBreakpointsPerCol()); + assertEquals(2, pl.getBreakpointsPerCol().length); + + for(int c = 0; c < 2; c++) { + for(int r = 0; r < nrows; r++) { + int seg = findSegment(bp[c], r); + double expected = slopes[c][seg] * r + intercepts[c][seg]; + assertEquals(expected, cg.getIdx(r, c), 1e-8); + } + } } @Test - public void testComputeTotalSSEThreeSegments() { - // Ein Segment mit drei identischen Werten, zwei Segmente mit jeweils zwei Werten - double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0}; - List breaks = Arrays.asList(0, 3, 5, 7); + public void testCreateMultiColumnRandom() { + Random rng = new Random(SEED); + final int nrows = 80, numGlobalCols = 5; + int[] globalCols = {2, 7, 12, 25, 42}; - // Segment [0,3): konstant 1.0 → SSE = 0 - double sse1 = computeSegmentCost(column, 0, 3); // 0 + int numSegs = rng.nextInt(3) + 1; + int[][] bp = new int[numGlobalCols][numSegs + 1]; + double[][] slopes = new double[numGlobalCols][numSegs]; + double[][] intercepts = new double[numGlobalCols][numSegs]; - // Segment [3,5): [2,2] → SSE = 0 - double sse2 = computeSegmentCost(column, 3, 5); // 0 + double slope = rng.nextDouble() * 4 - 2; + double intercept = rng.nextDouble() * 4 - 2; + for(int c = 0; c < numGlobalCols; c++) { + bp[c][0] = 0; + bp[c][numSegs] = nrows; + for(int s = 1; s < numSegs; s++) + bp[c][s] = rng.nextInt(nrows - 10) + 5; + Arrays.fill(slopes[c], slope); + Arrays.fill(intercepts[c], intercept); + } - // Segment [5,7): [3,3] → SSE = 0 - double sse3 = computeSegmentCost(column, 5, 7); // 0 + IColIndex cols = ColIndexFactory.create(globalCols); + AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, nrows); + + assertTrue(cg.getNumValues() > 0); + assertEquals(numGlobalCols, cols.size()); - double total = computeTotalSSE(column, breaks); - assertEquals(0.0, total, 1e-10); - 
assertEquals(sse1 + sse2 + sse3, total, 1e-10); + for(int r = 0; r < nrows; r++) { + double expected = slope * r + intercept; + for(int localC = 0; localC < numGlobalCols; localC++) { + assertEquals(expected, cg.getIdx(r, localC), 1e-8); + } + } } @Test - public void testComputeTotalSSEGapStartEnd() { - double[] column = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0}; - List breaks = Arrays.asList(2, 5, 8); - - double total = computeTotalSSE(column, breaks); - double sse1 = computeSegmentCost(column, 2, 5); - double sse2 = computeSegmentCost(column, 5, 8); + public void testCreateSingleColumnRandom() { + Random rng = new Random(SEED); + final int nrows = rng.nextInt(30) + 20; + int numSegs = rng.nextInt(3) + 1; - assertEquals(sse1 + sse2, total, 1e-10); + int[] bp = new int[numSegs + 1]; + bp[0] = 0; + bp[numSegs] = nrows; + for(int s = 1; s < numSegs; s++) + bp[s] = rng.nextInt(nrows / 2) + 5; - } + double[] slopes = new double[numSegs]; + double[] intercepts = new double[numSegs]; + for(int s = 0; s < numSegs; s++) { + slopes[s] = rng.nextDouble() * 4 - 2; + intercepts[s] = rng.nextDouble() * 4 - 2; + } - @Test - public void testComputeTotalSSEOneSegmentIdentical() { - double[] column = {1.0, 2.0, 3.0, 4.0, 5.0}; - double sseTotal = computeSegmentCost(column, 0, 5); + IColIndex cols = ColIndexFactory.create(new int[] {rng.nextInt(50)}); + int[][] bp2d = {bp}; + double[][] slopes2d = {slopes}; + double[][] ints2d = {intercepts}; - List breaks = Arrays.asList(0, 5); - double total = computeTotalSSE(column, breaks); + AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp2d, slopes2d, ints2d, nrows); - assertEquals(sseTotal, total, 1e-10); + for(int r = 0; r < nrows; r++) { + int seg = findSegment(bp, r); + double expected = slopes[seg] * r + intercepts[seg]; + assertEquals(expected, cg.getIdx(r, 0), 1e-8); + } } @Test - public void testComputeTotalSSENonConstant() { - double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; - List breaks = Arrays.asList(0, 2, 
5); + public void testDecompressToDenseBlock() { + int[][] bp = {{0, 5, 10}}; + double[][] slopes = {{1.0, 2.0}}; + double[][] intercepts = {{0.0, 1.0}}; + int numRows = 10; + + IColIndex cols = ColIndexFactory.create(new int[] {0}); + AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, numRows); - double total = computeTotalSSE(column, breaks); - double sse1 = computeSegmentCost(column, 0, 2); - double sse2 = computeSegmentCost(column, 2, 5); + MatrixBlock target = new MatrixBlock(numRows, 1, false); + target.allocateDenseBlock(); - assertTrue(total >= 0.0); - assertEquals(sse1 + sse2, total, 1e-10); - } + DenseBlock db = target.getDenseBlock(); + assertNotNull("DenseBlock null?", db); - @Test - public void testComputeTotalSSEEdgeCases() { - double[] columnEmpty = {}; - List breaksEmpty = Arrays.asList(0, 0); - assertEquals(0.0, computeTotalSSE(columnEmpty, breaksEmpty), 1e-10); + cg.decompressToDenseBlock(db, 0, numRows, 0, 0); + + for(int r = 0; r < numRows; r++) { + double expected = (r < 5) ? 
(1.0 * r + 0.0) : (2.0 * r + 1.0); + assertEquals("Row " + r + " mismatch", expected, db.get(r, 0), 1e-9); + } - double[] columnOne = {42.0}; - List breaksOne = Arrays.asList(0, 1); - double total = computeTotalSSE(columnOne, breaksOne); - assertEquals(0.0, total, 1e-10); + assertEquals(0.0, db.get(0, 0), 1e-9); + assertEquals(4.0, db.get(4, 0), 1e-9); + assertEquals(11.0, db.get(5, 0), 1e-9); + assertEquals(19.0, db.get(9, 0), 1e-9); } - @Test - public void testRegressSegmentEmpty() { - double[] column = {1.0, 2.0, 3.0}; - double[] result = regressSegment(column, 0, 0); - assertEquals(0.0, result[0], 1e-10); - assertEquals(0.0, result[1], 1e-10); + private ColGroupPiecewiseLinearCompressed createTestGroup(int numRows) { + int[][] bp = {{0, 5, numRows}}; + double[][] slopes = {{1.0, 3.0}}; + double[][] intercepts = {{0.0, 2.0}}; + return (ColGroupPiecewiseLinearCompressed) ColGroupPiecewiseLinearCompressed.create( + ColIndexFactory.create(new int[] {0}), bp, slopes, intercepts, numRows); } - @Test - public void testRegressSegmentSinglePoint() { - double[] column = {1.0, 2.0, 3.0}; - double[] result = regressSegment(column, 1, 2); + private double computeMSE(MatrixBlock orig, MatrixBlock recon) { + double sumSqErr = 0.0; + final int rows = orig.getNumRows(), cols = orig.getNumColumns(); + DenseBlock origDb = orig.getDenseBlock(); + DenseBlock reconDb = recon.getDenseBlock(); - assertEquals(0.0, result[0], 1e-10); - assertEquals(2.0, result[1], 1e-10); + for(int r = 0; r < rows; r++) + for(int c = 0; c < cols; c++) { + double diff = origDb.get(r, c) - reconDb.get(r, c); + sumSqErr += diff * diff; + } + return sumSqErr / (rows * cols); } @Test - public void testRegressSegmentTwoIdentical() { - double[] column = {5.0, 5.0, 1.0, 1.0}; - double[] result = regressSegment(column, 0, 2); + public void testDecompressRandomMultiCol() { + final int nrows = 50, ncols = 3; + double[][] origData = getRandomMatrix(nrows, ncols, -3, 3, 1.0, SEED); - assertEquals(0.0, result[0], 
1e-10); - assertEquals(5.0, result[1], 1e-10); - } + int[] colArray = {0, 1, 2}; + IColIndex cols = ColIndexFactory.create(colArray); + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(10.0); - @Test - public void testRegressSegmentTwoPoints() { - double[] column = {0.0, 2.0}; - double[] result = regressSegment(column, 0, 2); + MatrixBlock orig = DataConverter.convertToMatrixBlock(origData); + orig.allocateDenseBlock(); + + AColGroup cg = ColGroupFactory.compressPiecewiseLinearFunctional(cols, orig, cs); + ColGroupPiecewiseLinearCompressed pl = (ColGroupPiecewiseLinearCompressed) cg; - assertEquals(2.0, result[0], 1e-10); - assertEquals(0.0, result[1], 1e-10); + MatrixBlock recon = new MatrixBlock(nrows, ncols, false); + recon.allocateDenseBlock(); + pl.decompressToDenseBlock(recon.getDenseBlock(), 0, nrows, 0, ncols - 1); + + double mse = computeMSE(orig, recon); + assertTrue("MSE=" + mse + " > bound 20.0", mse <= 20.0); } @Test - public void testRegressSegmentTwoPointsOffset() { + public void testDecompressRandomSingleCol() { + final int nrows = 40, ncols = 1; + double[][] origData = getRandomMatrix(nrows, ncols, -2, 2, 1.0, SEED); + + IColIndex cols = ColIndexFactory.create(new int[] {0}); + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(5.0); - double[] column = {1.0, 3.0, 5.0, 7.0}; - double[] result = regressSegment(column, 2, 4); + MatrixBlock orig = DataConverter.convertToMatrixBlock(origData); + orig.allocateDenseBlock(); - assertEquals(2.0, result[0], 1e-10); - assertEquals(1.0, result[1], 1e-10); - } + AColGroup cg = ColGroupFactory.compressPiecewiseLinearFunctional(cols, orig, cs); + ColGroupPiecewiseLinearCompressed pl = (ColGroupPiecewiseLinearCompressed) cg; - @Test - public void testRegressSegmentConstant() { - double[] column = {3.0, 3.0, 3.0, 3.0}; - double[] result = regressSegment(column, 0, 4); + MatrixBlock recon = new MatrixBlock(nrows, 1, false); 
+ recon.allocateDenseBlock(); + pl.decompressToDenseBlock(recon.getDenseBlock(), 0, nrows, 0, 0); - assertEquals(0.0, result[0], 1e-10); - assertEquals(3.0, result[1], 1e-10); + double mse = computeMSE(orig, recon); + assertTrue("Single-Col MSE=" + mse + " > 8.0", mse <= 8.0); } @Test - public void testRegressSegmentLinear() { - double[] column = new double[4]; - double a = 1.5, b = 2.0; - for(int i = 0; i < 4; i++) { - column[i] = a * i + b; + public void testDecompressRandomTrend() { + final int nrows = 60, ncols = 2; + Random rng = new Random(SEED); + double[][] origData = new double[nrows][ncols]; + + for(int r = 0; r < nrows; r++) { + double trend = 0.03 * r; + for(int c = 0; c < ncols; c++) { + origData[r][c] = trend + rng.nextGaussian() * 1.2 + c * 1.5; + } } - double[] result = regressSegment(column, 0, 4); + int[] colArray = {0, 1}; + IColIndex cols = ColIndexFactory.create(colArray); - assertEquals(a, result[0], 1e-10); - assertEquals(b, result[1], 1e-10); - } + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(8.0); + + MatrixBlock orig = DataConverter.convertToMatrixBlock(origData); + orig.allocateDenseBlock(); + AColGroup cg = ColGroupFactory.compressPiecewiseLinearFunctional(cols, orig, cs); + assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); + ColGroupPiecewiseLinearCompressed pl = (ColGroupPiecewiseLinearCompressed) cg; + + MatrixBlock recon = new MatrixBlock(nrows, ncols, false); + recon.allocateDenseBlock(); + pl.decompressToDenseBlock(recon.getDenseBlock(), 0, nrows, 0, ncols - 1); + + double mse = computeMSE(orig, recon); + assertTrue("Trend MSE=" + String.format("%.4f", mse) + " > bound 12.0", mse <= 12.0); + int[][] bp = pl.getBreakpointsPerCol(); + assertEquals(2, bp.length); + for(int c = 0; c < 2; c++) { + assertEquals(0, bp[c][0]); + assertEquals(nrows, bp[c][bp[c].length - 1]); + assertTrue(bp[c].length >= 2); + } + } @Test - public void testCompressPiecewiseLinearFunctionalConst() 
{ - // 1. MatrixBlock mit einer konstanten Spalte erzeugen - int nrows = 20, ncols = 1; - MatrixBlock in = new MatrixBlock(nrows, ncols, false); - for(int r = 0; r < nrows; r++) - in.set(r, 0, 1.0); - // 2. colIndexes für Spalte 0 - IColIndex colIndexes = ColIndexFactory.create(new int[] {0}); - // 3. CompressionSettings mit TargetLoss + public void testDecompressRandomJumps() { + final int nrows = 50, ncols = 2; + double[][] origData = getRandomMatrix(nrows, ncols, -2, 2, 1.0, SEED); + + for(int c = 0; c < ncols; c++) { + for(int r = 20; r < 30; r++) + origData[r][c] += 2.0; + for(int r = 35; r < nrows; r++) + origData[r][c] += 7.0; + } + + int[] colArray = {0, 1}; + IColIndex cols = ColIndexFactory.create(colArray); + CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(1e-6); - // 4. Aufruf der Kompressionsfunktion - AColGroup result = ColGroupFactory.compressPiecewiseLinearFunctional(colIndexes, in, cs); + cs.setPiecewiseTargetLoss(12.0); - // 5. Ergebnis ist eine ColGroupPiecewiseLinearCompressed? - assertTrue(result instanceof ColGroupPiecewiseLinearCompressed); - ColGroupPiecewiseLinearCompressed plGroup = (ColGroupPiecewiseLinearCompressed) result; + MatrixBlock orig = DataConverter.convertToMatrixBlock(origData); + orig.allocateDenseBlock(); - // 6. Breakpoints per Getter, nicht per create() - int[] breakpoints = plGroup.getBreakpoints(); - assertArrayEquals(new int[] {0, 20}, breakpoints); + AColGroup cg = ColGroupFactory.compressPiecewiseLinearFunctional(cols, orig, cs); + assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); + ColGroupPiecewiseLinearCompressed pl = (ColGroupPiecewiseLinearCompressed) cg; - // 7. 
Pro Segment: 1 Segment → ein slope, ein intercept - double[] slopes = plGroup.getSlopes(); - double[] intercepts = plGroup.getIntercepts(); - assertEquals(1, slopes.length); - assertEquals(1, intercepts.length); + MatrixBlock recon = new MatrixBlock(nrows, ncols, false); + recon.allocateDenseBlock(); + pl.decompressToDenseBlock(recon.getDenseBlock(), 0, nrows, 0, ncols - 1); - // 8. Für konstante Daten: Steigung ~0, intercept ~1.0 - assertEquals(0.0, slopes[0], 1e-10); - assertEquals(1.0, intercepts[0], 1e-10); + double mse = computeMSE(orig, recon); + assertTrue("Jumps MSE=" + String.format("%.4f", mse) + " > bound 18.0", mse <= 18.0); - // 9. Check: colIndexes stimmt - IColIndex idx = plGroup.getColIndices(); - assertEquals(1, idx.size()); - assertEquals(0, idx.get(0)); + int[][] bp = pl.getBreakpointsPerCol(); + assertEquals(2, bp.length); + for(int c = 0; c < 2; c++) { + assertEquals(0, bp[c][0]); + assertEquals(nrows, bp[c][bp[c].length - 1]); + assertTrue(bp[c].length >= 3); + } } - @Test(expected = IllegalArgumentException.class) - public void testCreateNullBreakpoints() { - int[] nullBp = null; - ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), nullBp, new double[] {1.0}, - new double[] {0.0}, 10); - } + private CompressedSizeInfo createTestCompressedSizeInfo() { + IColIndex cols = ColIndexFactory.create(new int[] {0}); + EstimationFactors facts = new EstimationFactors(2, 10); - @Test(expected = IllegalArgumentException.class) - public void testCreateTooFewBreakpoints() { - int[] singleBp = {0}; - ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), singleBp, new double[] {1.0}, - new double[] {0.0}, 10); - } + CompressedSizeInfoColGroup info = new CompressedSizeInfoColGroup(cols, facts, + AColGroup.CompressionType.PiecewiseLinear); - @Test(expected = IllegalArgumentException.class) - public void testCreateInconsistentSlopes() { - int[] bp = {0, 5, 10}; - 
ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), bp, - new double[] {1.0, 2.0, 3.0}, new double[] {0.0, 1.0}, 10); - } + List infos = Arrays.asList(info); + CompressedSizeInfo csi = new CompressedSizeInfo(infos); - @Test(expected = IllegalArgumentException.class) - public void testCreateInconsistentIntercepts() { - int[] bp = {0, 5, 10}; - ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), bp, new double[] {1.0, 2.0}, - new double[] {0.0}, 10); + return csi; } @Test - public void testCreateValidMultiSegment() { - int[] bp = {0, 3, 7, 10}; - double[] slopes = {1.0, -2.0, 0.5}; - double[] intercepts = {0.0, 5.0, -1.0}; - IColIndex cols = ColIndexFactory.create(new int[] {0, 1}); + public void testCompressPiecewiseLinearViaRealAPI() { - AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, 10); + MatrixBlock in = new MatrixBlock(10, 1, false); + in.allocateDenseBlock(); + for(int r = 0; r < 10; r++) { + in.set(r, 0, r * 0.5); + } - assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); - assertNotSame(bp, ((ColGroupPiecewiseLinearCompressed) cg).getBreakpoints()); - } + CompressionSettings cs = new CompressionSettingsBuilder().addValidCompression( + AColGroup.CompressionType.PiecewiseLinear).create(); - @Test - public void testCreateMultiColumn() { - IColIndex cols = ColIndexFactory.create(new int[] {5, 10, 15}); - int[] bp = {0, 5}; - double[] slopes = {3.0}; - double[] intercepts = {2.0}; + CompressedSizeInfo csi = createTestCompressedSizeInfo(); - AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, 100); - assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); - assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); + List colGroups = ColGroupFactory.compressColGroups(in, csi, cs); - // - assertTrue(cg.getNumValues() > 0); + boolean hasPiecewise = colGroups.stream().anyMatch(cg -> cg instanceof 
ColGroupPiecewiseLinearCompressed); + assertTrue(hasPiecewise); + } - for(int r = 0; r < 5; r++) { - double expected = 3.0 * r + 2.0; - // colIdx=0 → globale Spalte 5 - assertEquals(expected, cg.getIdx(r, 0), 1e-9); - // colIdx=1 → globale Spalte 10 - assertEquals(expected, cg.getIdx(r, 1), 1e-9); - // colIdx=2 → globale Spalte 15 - assertEquals(expected, cg.getIdx(r, 2), 1e-9); - } + private double computeColumnMSE(MatrixBlock orig, MatrixBlock target, int col) { + final int numRows = orig.getNumRows(); + double totalSSE = 0.0; + final int origStride = orig.getNumColumns(); + final int tgtStride = target.getNumColumns(); - for(int r = 5; r < 10; r++) { - double expected = 3.0 * r + 2.0; - assertEquals(expected, cg.getIdx(r, 0), 1e-9); // Alle Columns gleich + for(int r = 0; r < numRows; r++) { + double origVal = orig.getDenseBlock().pos(r * origStride + col); + double tgtVal = target.getDenseBlock().pos(r * tgtStride + col); + totalSSE += (origVal - tgtVal) * (origVal - tgtVal); } - assertEquals(cols.size(), 3); + return totalSSE / numRows; } @Test - public void testCreateSingleColumn() { - IColIndex cols = ColIndexFactory.create(new int[] {5}); - int[] bp = {0, 5}; - double[] slopes = {3.0}; - double[] intercepts = {2.0}; - int numRows = 10; + public void testSukzessiveLinearColumnSingleSegment() { + double[] linearCol = {1.0, 2.0, 3.0, 4.0, 5.0}; + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(1e-6); - AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, numRows); + List breaks = PiecewiseLinearUtils.computeBreakpointSukzessive(linearCol, cs); + assertEquals("[0, 5]", breaks.toString()); + } - assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); + @Test + public void testSukzessiveNoisyColumnMultipleSegments() { + double[] noisyCol = {1.1, 1.9, 2.2, 10.1, 10.8, 11.3}; + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(1.0); - 
assertEquals(2.0, cg.getIdx(0, 0), 1e-9); // 3*0 + 2 - assertEquals(5.0, cg.getIdx(1, 0), 1e-9); // 3*1 + 2 + List breaks = PiecewiseLinearUtils.computeBreakpointSukzessive(noisyCol, cs); + assertTrue(breaks.size() >= 3); } @Test - public void testCreateValidMinimal() { + public void testSukzessiveTargetLossIncreasesSegments() { + double[] colWithJumps = {1, 2, 3, 10, 11, 12, 20, 21, 22}; + CompressionSettings csStrict = new CompressionSettingsBuilder().create(); + csStrict.setPiecewiseTargetLoss(0.01); - // 1 Segment: [0,10] → y = 2.0 * r + 1.0 - int[] bp = {0, 10}; - double[] slopes = {2.0}; - double[] intercepts = {1.0}; - IColIndex cols = ColIndexFactory.create(new int[] {0}); - int numRows = 10; + CompressionSettings csLoose = new CompressionSettingsBuilder().create(); + csLoose.setPiecewiseTargetLoss(10.0); - AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, numRows); + List strictBreaks = PiecewiseLinearUtils.computeBreakpointSukzessive(colWithJumps, csStrict); + List looseBreaks = PiecewiseLinearUtils.computeBreakpointSukzessive(colWithJumps, csLoose); - // Korrekte Instanz - assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); + assertTrue(strictBreaks.size() > looseBreaks.size()); + } - // getNumValues() > 0 - assertTrue(cg.getNumValues() > 0); + @Test + public void testMultiColumnTargetLossRespected() { + final int rows = 50, cols = 2; + double[][] data = getRandomMatrix(rows, cols, 0, 10, 1.0, 42L); + MatrixBlock orig = DataConverter.convertToMatrixBlock(data); + orig.allocateDenseBlock(); - // r < numRows - for(int r = 0; r < numRows; r++) { - double expected = 2.0 * r + 1.0; - assertEquals("Row " + r, expected, cg.getIdx(r, 0), 1e-9); - } + IColIndex colIdx = ColIndexFactory.create(0, cols - 1); + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(1.0); + + AColGroup cg = ColGroupFactory.compressPiecewiseLinearFunctionalSukzessive(colIdx, orig, cs); - // Letzte 
gültige Row - assertEquals(19.0, cg.getIdx(9, 0), 1e-9); + MatrixBlock target = new MatrixBlock(rows, cols, false); + target.allocateDenseBlock(); + + cg.decompressToDenseBlock(target.getDenseBlock(), 0, rows - 1, 0, cols - 1); - //Out-of-Bounds korrekt 0.0 - assertEquals(0.0, cg.getIdx(10, 0), 1e-9); - assertEquals(0.0, cg.getIdx(9, 1), 1e-9); + for(int c = 0; c < cols; c++) { + double mse = computeColumnMSE(orig, target, c); + System.out.println("Col " + c + " MSE = " + mse); + assertTrue("Col " + c + " MSE=" + mse + " > target=1.0", mse <= 1.0 + 1e-10); + } } @Test - public void testDecompressToDenseBlock() { - int[] bp = {0, 5, 10}; - double[] slopes = {1.0, 2.0}; - double[] intercepts = {0.0, 1.0}; - int numRows = 10; + public void testMultiColumnRandomDecompressLoss() { + final int rows = 60, cols = 3; + double[][] origData = getRandomMatrix(rows, cols, -5, 5, 1.0, SEED); - AColGroup cg = ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), bp, slopes, - intercepts, numRows); - - // 1. MatrixBlock mit korrekten Dimensionen - MatrixBlock target = new MatrixBlock(numRows, 1, false); + IColIndex colIdx = ColIndexFactory.create(0, cols - 1); + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(8.0); - // 2. DenseBlock ZUERST alloziieren! - target.allocateDenseBlock(); // Oder target.allocateDenseBlock(true); + MatrixBlock orig = DataConverter.convertToMatrixBlock(origData); + orig.allocateDenseBlock(); - // 3. Jetzt DenseBlock verfügbar - DenseBlock db = target.getDenseBlock(); - assertNotNull(db); // Sicherstellen! + AColGroup cg = ColGroupFactory.compressPiecewiseLinearFunctionalSukzessive(colIdx, orig, cs); - // 4. Dekomprimieren - cg.decompressToDenseBlock(db, 0, numRows, 0, 0); + MatrixBlock target = new MatrixBlock(rows, cols, false); + target.allocateDenseBlock(); + cg.decompressToDenseBlock(target.getDenseBlock(), 0, rows, 0, cols - 1); - // 5. 
Prüfen - for(int r = 0; r < numRows; r++) { - double expected = (r < 5) ? 1.0 * r : 2.0 * r + 1.0; - assertEquals("Row " + r, expected, db.get(r, 0), 1e-9); + for(int c = 0; c < cols; c++) { + double mse = computeColumnMSE(orig, target, c); + assertTrue("Col " + c + " MSE=" + mse + " > bound 15.0", mse <= 15.0); } } - private ColGroupPiecewiseLinearCompressed createTestGroup(int numRows) { - int[] bp = {0, 5, numRows}; - double[] slopes = {1.0, 3.0}; - double[] intercepts = {0.0, 2.0}; - return (ColGroupPiecewiseLinearCompressed) ColGroupPiecewiseLinearCompressed.create( - ColIndexFactory.create(new int[] {0}), bp, slopes, intercepts, numRows); - } - @Test - public void testDecompressToDenseBlockFullRange() { - ColGroupPiecewiseLinearCompressed cg = createTestGroup(12); + public void testDecompressRandomTrendJumps() { + final int rows = 80, cols = 2; + Random rng = new Random(42L); + double[][] origData = new double[rows][cols]; - MatrixBlock target = new MatrixBlock(12, 1, false); - target.allocateDenseBlock(); - DenseBlock db = target.getDenseBlock(); + for(int r = 0; r < rows; r++) { + double trend = 0.04 * r; + for(int c = 0; c < cols; c++) { + origData[r][c] = trend + rng.nextGaussian() * 1.5; + if(r >= 25 && r < 45) + origData[r][c] += 6.0; + if(r >= 60) + origData[r][c] += 10.0; + } + } - cg.decompressToDenseBlock(db, 0, 12, 0, 0); + IColIndex colIdx = ColIndexFactory.create(0, cols - 1); + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(10.0); - // Segment 0 [0,5): y = r - assertEquals(0.0, db.get(0, 0), 1e-9); - assertEquals(4.0, db.get(4, 0), 1e-9); + MatrixBlock orig = DataConverter.convertToMatrixBlock(origData); + orig.allocateDenseBlock(); - assertEquals(17.0, db.get(5, 0), 1e-9); - assertEquals(29.0, db.get(9, 0), 1e-9); - assertEquals(32.0, db.get(10, 0), 1e-9); - assertEquals(35.0, db.get(11, 0), 1e-9); + AColGroup cg = ColGroupFactory.compressPiecewiseLinearFunctionalSukzessive(colIdx, orig, cs); + 
MatrixBlock target = new MatrixBlock(rows, cols, false); + target.allocateDenseBlock(); + cg.decompressToDenseBlock(target.getDenseBlock(), 0, rows, 0, cols - 1); + + for(int c = 0; c < cols; c++) { + double mse = computeColumnMSE(orig, target, c); + assertTrue("Trend+Jumps Col " + c + ": MSE=" + mse + " > 20.0", mse <= 20.0); + } } @Test - public void testDecompressToDenseBlockPartialRange() { - ColGroupPiecewiseLinearCompressed cg = createTestGroup(12); + public void testDecompressRandomSingleColSukzessive() { + final int rows = 40; + Random rng = new Random(SEED); + double[] origCol = new double[rows]; - MatrixBlock target = new MatrixBlock(12, 1, false); - target.allocateDenseBlock(); - DenseBlock db = target.getDenseBlock(); + for(int r = 0; r < rows; r++) { + origCol[r] = 0.02 * r + rng.nextGaussian() * 0.8; + } - // rl=6, ru=9 → r=6,7,8 dekomprimieren - // offR=0 → schreibt in Target-Rows 6,7,8 - cg.decompressToDenseBlock(db, 6, 9, 0, 0); + double[][] origData = new double[rows][1]; + for(int r = 0; r < rows; r++) + origData[r][0] = origCol[r]; - assertEquals(0.0, db.get(0, 0), 1e-9); // Unberührt (vor rl=6) - assertEquals(20.0, db.get(6, 0), 1e-9); - assertEquals(23.0, db.get(7, 0), 1e-9); - assertEquals(26.0, db.get(8, 0), 1e-9); - assertEquals(0.0, db.get(9, 0), 1e-9); // Unberührt (nach ru=9) - } + IColIndex colIdx = ColIndexFactory.create(new int[] {0}); - @Test - public void testDecompressToDenseBlockEmptyRange() { - ColGroupPiecewiseLinearCompressed cg = createTestGroup(12); + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(1.0); + + MatrixBlock orig = DataConverter.convertToMatrixBlock(origData); + orig.allocateDenseBlock(); + + AColGroup cg = ColGroupFactory.compressPiecewiseLinearFunctionalSukzessive(colIdx, orig, cs); + ColGroupPiecewiseLinearCompressed pl = (ColGroupPiecewiseLinearCompressed) cg; - MatrixBlock target = new MatrixBlock(5, 1, false); + MatrixBlock target = new MatrixBlock(rows, 1, 
false); target.allocateDenseBlock(); - DenseBlock db = target.getDenseBlock(); + pl.decompressToDenseBlock(target.getDenseBlock(), 0, rows, 0, 0); + + double mse = computeColumnMSE(orig, target, 0); + assertTrue("Single-Col MSE=" + mse + " > 3.0", mse <= 3.0); - // Leerer Bereich - cg.decompressToDenseBlock(db, 12, 12, 0, 0); // rl=ru - cg.decompressToDenseBlock(db, 3, 2, 0, 0); // rl>ru + int[][] bp = pl.getBreakpointsPerCol(); + assertEquals(1, bp.length); + assertEquals(0, bp[0][0]); + assertEquals(rows, bp[0][bp[0].length - 1]); + } - // Alles bleibt 0.0 - for(int r = 0; r < 5; r++) { - assertEquals(0.0, db.get(r, 0), 1e-9); + private boolean hasBreakInRange(int[] bps, int min, int max) { + for(int i = 1; i < bps.length - 1; i++) { + if(bps[i] >= min && bps[i] <= max) + return true; } + return false; } @Test - public void testDecompressToDenseBlockNullSafety() { - ColGroupPiecewiseLinearCompressed cg = createTestGroup(10); + public void testBreakpointsRandomJump() { + final int len = 30; + double[] col = getRandomColumn(len, SEED); - // Null DenseBlock - cg.decompressToDenseBlock(null, 0, 10, 0, 0); + for(int r = 10; r < 20; r++) + col[r] += 8.0; - // Ungültige Parameter (leerer Bereich) - MatrixBlock target = new MatrixBlock(10, 1, false); - target.allocateDenseBlock(); - DenseBlock db = target.getDenseBlock(); + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(2.0); - cg.decompressToDenseBlock(db, 12, 12, 0, 0); // rl == ru - cg.decompressToDenseBlock(db, 5, 2, 0, 0); // rl > ru + List bps = computeBreakpointSukzessive(col, cs); + int[] bpsArray = bps.stream().mapToInt(Integer::intValue).toArray(); - // Target unverändert - for(int r = 0; r < 10; r++) { - assertEquals(0.0, db.get(r, 0), 1e-9); - } + assertTrue(" (Segs=" + bps.size() + ")", bps.size() >= 3); + assertTrue("No Break in Jump", hasBreakInRange(bpsArray, 8, 22)); } - private CompressedSizeInfo createTestCompressedSizeInfo() { - IColIndex cols = 
ColIndexFactory.create(new int[] {0}); - EstimationFactors facts = new EstimationFactors(2, 10); + @Test + public void testGlobalMSE_random() { + final int len = 40; + double[] col = getRandomColumn(len, SEED + 1); - CompressedSizeInfoColGroup info = new CompressedSizeInfoColGroup(cols, facts, - AColGroup.CompressionType.PiecewiseLinear); + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(1.5); - List infos = Arrays.asList(info); - CompressedSizeInfo csi = new CompressedSizeInfo(infos); + List bps = computeBreakpointSukzessive(col, cs); + double totalSSE = 0.0; + for(int i = 0; i < bps.size() - 1; i++) { + totalSSE += computeSegmentCost(col, bps.get(i), bps.get(i + 1)); + } + double mse = totalSSE / col.length; - return csi; + assertTrue("Global MSE=" + mse + " > target=" + cs.getPiecewiseTargetLoss(), + mse <= cs.getPiecewiseTargetLoss() + 1e-10); + } + + private double[] getRandomColumn(int len, long seed) { + Random rng = new Random(seed); + double[] col = new double[len]; + for(int i = 0; i < len; i++) + col[i] = rng.nextGaussian() * 2 + i * 0.01; + return col; } @Test - public void testCompressPiecewiseLinearViaRealAPI() { + public void testGetExactSizeOnDiskRandom() { + Random rng = new Random(SEED); + final int nrows = 80 + rng.nextInt(40); - MatrixBlock in = new MatrixBlock(10, 1, false); - in.allocateDenseBlock(); - for(int r = 0; r < 10; r++) { - in.set(r, 0, r * 0.5); + int numSegs = 1 + rng.nextInt(3); + int[] bp = new int[numSegs + 1]; + bp[0] = 0; + bp[numSegs] = nrows; + for(int s = 1; s < numSegs; s++) + bp[s] = rng.nextInt(nrows * 2 / 3) + nrows / 10; + + double[] slopes = new double[numSegs]; + double[] intercepts = new double[numSegs]; + for(int s = 0; s < numSegs; s++) { + slopes[s] = rng.nextDouble() * 4 - 2; + intercepts[s] = rng.nextDouble() * 4 - 2; } - CompressionSettings cs = new CompressionSettingsBuilder().addValidCompression( - AColGroup.CompressionType.PiecewiseLinear).create(); + 
IColIndex cols = ColIndexFactory.create(new int[] {rng.nextInt(20)}); + int[][] bp2d = {bp}; + double[][] slopes2d = {slopes}; + double[][] ints2d = {intercepts}; - CompressedSizeInfo csi = createTestCompressedSizeInfo(); + AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp2d, slopes2d, ints2d, nrows); - List colGroups = ColGroupFactory.compressColGroups(in, csi, cs); + long diskSize = cg.getExactSizeOnDisk(); + System.out.println("Single Random: nrows=" + nrows + ", segs=" + numSegs + ", size=" + diskSize); - boolean hasPiecewise = colGroups.stream().anyMatch(cg -> cg instanceof ColGroupPiecewiseLinearCompressed); - assertTrue(hasPiecewise); + assertTrue(diskSize > 0); + assertTrue(cg.getNumValues() > 0); } + @Test + public void testMultiColSizeRandom() { + Random rng = new Random(SEED + 1); + final int nrows = 100; + final int numGlobalCols = 3 + rng.nextInt(3); + int[] globalCols = new int[numGlobalCols]; + for(int i = 0; i < numGlobalCols; i++) + globalCols[i] = rng.nextInt(50) + i * 5; + + int[][] bp = new int[numGlobalCols][]; + double[][] slopes = new double[numGlobalCols][]; + double[][] intercepts = new double[numGlobalCols][]; + + for(int c = 0; c < numGlobalCols; c++) { + int numSegs = 1 + rng.nextInt(4); + bp[c] = new int[numSegs + 1]; + bp[c][0] = 0; + bp[c][numSegs] = nrows; + for(int s = 1; s < numSegs; s++) + bp[c][s] = rng.nextInt(nrows * 3 / 4) + nrows / 8; + + slopes[c] = new double[numSegs]; + intercepts[c] = new double[numSegs]; + for(int s = 0; s < numSegs; s++) { + slopes[c][s] = rng.nextDouble() * 3 - 1.5; + intercepts[c][s] = rng.nextDouble() * 3 - 1.5; + } + } - public void testGreedy_linearColumn_singleSegment() { - // 2. 
Perfekte Gerade → 1 Segment - double[] linearCol = {1.0, 2.0, 3.0, 4.0, 5.0}; // y=x+1 - CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(1e-6); + IColIndex cols = ColIndexFactory.create(globalCols); + AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, nrows); + + if(cg instanceof ColGroupPiecewiseLinearCompressed) { + ColGroupPiecewiseLinearCompressed pl = (ColGroupPiecewiseLinearCompressed) cg; + + long diskSize = cg.getExactSizeOnDisk(); + System.out.println("Multi Random: cols=" + numGlobalCols + ", size=" + diskSize); + + assertEquals(numGlobalCols, cols.size()); + assertEquals(numGlobalCols, pl.getBreakpointsPerCol().length); + for(int c = 0; c < numGlobalCols; c++) { + assertEquals(nrows, pl.getBreakpointsPerCol()[c][pl.getBreakpointsPerCol()[c].length - 1]); + } + assertTrue(diskSize > 0); + } - List breaks = PiecewiseLinearUtils.computeBreakpointsGreedy(linearCol, cs); - assertEquals("[0, 5]", breaks.toString()); // SSE=0 ✓ } - @Test - public void testGreedy_noisyColumn_multipleSegments() { - // 3. Mit Sprung → 2 Segmente - double[] noisyCol = {1.1, 1.9, 2.2, 10.1, 10.8, 11.3}; // Sprung bei 3 - CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(1.0); // Erlaubt MSE=1 + private ColGroupPiecewiseLinearCompressed createTestColGroup() { + int[][] bps = {{0, 2, 6}, // Col 0: Seg1(len=2), Seg2(len=4) + {0, 3, 6} // Col 1: Seg1(len=3), Seg2(len=3) + }; + double[][] ints = {{1.0, 3.0}, // Col 0 intercepts + {2.0, 4.0} // Col 1 intercepts + }; + double[][] slps = {{0.5, 1.0}, // Col 0 slopes + {0.0, 2.0} // Col 1 slopes + }; + return new ColGroupPiecewiseLinearCompressed(ColIndexFactory.create(0, 2), bps, slps, ints, 6); + } - List breaks = PiecewiseLinearUtils.computeBreakpointsGreedy(noisyCol, cs); - // Erwartet mind. 
2 Segmente (Sprung erkennen) - assertTrue(breaks.size() >= 3); // [0, ?, 6] + @Test + public void testComputeSum() { + ColGroupPiecewiseLinearCompressed cg = createTestColGroup(); + double[] c = new double[2]; + cg.computeSum(c, 6); + assertEquals(20.5, c[0], 1e-8); + assertEquals(24.0, c[1], 1e-8); } @Test - public void testGreedy_targetLossIncreasesSegments() { - // 4. Höherer Target-Loss → weniger Segmente - double[] colWithJumps = {1,2,3, 10,11,12, 20,21,22}; - CompressionSettings csStrict = new CompressionSettingsBuilder().create(); - csStrict.setPiecewiseTargetLoss(0.01); // Streng → viele Segmente + public void testComputeColSums() { + ColGroupPiecewiseLinearCompressed cg = createTestColGroup(); + double[] c = new double[2]; - CompressionSettings csLoose = new CompressionSettingsBuilder().create(); - csLoose.setPiecewiseTargetLoss(10.0); + cg.computeColSums(c, 6); + assertEquals(20.5, c[0], 1e-8); + assertEquals(24.0, c[1], 1e-8); + } - List strictBreaks = PiecewiseLinearUtils.computeBreakpointsGreedy(colWithJumps, csStrict); - List looseBreaks = PiecewiseLinearUtils.computeBreakpointsGreedy(colWithJumps, csLoose); + @Test + public void testSingleColumn() { + int[][] bps1 = {{0, 3}}; + double[][] ints1 = {{1.0}}; + double[][] slps1 = {{2.0}}; + ColGroupPiecewiseLinearCompressed cg1 = new ColGroupPiecewiseLinearCompressed(ColIndexFactory.create(0, 1), + bps1, slps1, ints1, 3); - // Strenger Target → mehr Segmente - assertTrue(strictBreaks.size() > looseBreaks.size()); - } + RightScalarOperator plus5 = new RightScalarOperator(Plus.getPlusFnObject(), 5.0); + AColGroup result = cg1.scalarOperation(plus5); + ColGroupPiecewiseLinearCompressed plResult = (ColGroupPiecewiseLinearCompressed) result; + assertEquals(6.0, plResult.getInterceptsPerCol()[0][0], 1e-8); + double[] origSum = new double[1]; + cg1.computeSum(origSum, 3); + double[] newSum = new double[1]; + ((ColGroupPiecewiseLinearCompressed) result).computeSum(newSum, 3); + assertEquals(origSum[0] + 5.0 * 3, 
newSum[0], 1e-8); + } @Test - public void testMultiColumnTargetLossRespected() { - final int rows = 50, cols = 2; - double[][] data = getRandomMatrix(rows, cols, 0, 10, 1.0, 42L); - MatrixBlock orig = DataConverter.convertToMatrixBlock(data); - orig.allocateDenseBlock(); - - IColIndex colIdx = ColIndexFactory.create(0, cols-1); - CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(1.0); + public void testScalarPlus() { + ColGroupPiecewiseLinearCompressed cg = createTestColGroup(); + RightScalarOperator plus2 = new RightScalarOperator(Plus.getPlusFnObject(), 2.0); + ColGroupPiecewiseLinearCompressed result = (ColGroupPiecewiseLinearCompressed) cg.scalarOperation(plus2); + ColGroupPiecewiseLinearCompressed plResult = (ColGroupPiecewiseLinearCompressed) result; - AColGroup cg = ColGroupFactory.compressPiecewiseLinearFunctional(colIdx, orig, cs); + assertArrayEquals(new double[] {0.5, 1.0}, plResult.getSlopesPerCol()[0], 1e-8); + assertArrayEquals(new double[] {0.0, 2.0}, plResult.getSlopesPerCol()[1], 1e-8); - MatrixBlock target = new MatrixBlock(rows, cols, false); - target.allocateDenseBlock(); - cg.decompressToDenseBlock(target.getDenseBlock(), 0, rows-1, 0, cols-1); + assertArrayEquals(new double[] {3.0, 5.0}, plResult.getInterceptsPerCol()[0], 1e-8); + assertArrayEquals(new double[] {4.0, 6.0}, plResult.getInterceptsPerCol()[1], 1e-8); - // Test MSE für jede Spalte - for (int c = 0; c < cols; c++) { - double mse = computeColumnMSE(orig, target, c); - assertTrue("Col " + c + " MSE=" + mse + " > target=1.0", mse <= 1.0); - } + double[] origSums = new double[2]; + cg.computeSum(origSums, 6); + double[] newSums = new double[2]; + result.computeSum(newSums, 6); + assertEquals(origSums[0] + 12.0, newSums[0], 1e-8); + assertEquals(origSums[1] + 12.0, newSums[1], 1e-8); } + @Test + public void testBinaryRowOpLeftMultiply() { + ColGroupPiecewiseLinearCompressed cg = createTestColGroup(); + double[] v = {3.0, 4.0}; + 
BinaryOperator mult = new BinaryOperator(Multiply.getMultiplyFnObject()); - private double computeColumnMSE(MatrixBlock orig, MatrixBlock reconstructed, int colIdx) { - double mse = 0.0; - final int numRows = orig.getNumRows(); - - DenseBlock origDb = orig.getDenseBlock(); - DenseBlock reconDb = reconstructed.getDenseBlock(); + AColGroup result = cg.binaryRowOpLeft(mult, v, false); - for (int row = 0; row < numRows; row++) { - final double origValue = origDb.get(row, colIdx); // ← DENSEBLOCK.GET! - final double reconValue = reconDb.get(row, colIdx); - final double squaredError = (origValue - reconValue) * (origValue - reconValue); - mse += squaredError; - } + double[] sums = new double[2]; + result.computeColSums(sums, 6); - return mse / numRows; + assertEquals(61.5, sums[0], 1e-8); + assertEquals(96.0, sums[1], 1e-8); } + @Test + public void testBinaryRowOpRightPlus() { + ColGroupPiecewiseLinearCompressed cg = createTestColGroup(); + double[] v = {1.0, 2.0}; + BinaryOperator plus = new BinaryOperator(Plus.getPlusFnObject()); + AColGroup result = cg.binaryRowOpRight(plus, v, false); + double[] sums = new double[2]; + result.computeColSums(sums, 6); + assertEquals(26.5, sums[0], 1e-8); + assertEquals(36.0, sums[1], 1e-8); + } + + @Test + public void testContainsValue() { + ColGroupPiecewiseLinearCompressed cg = createTestColGroup(); + assertTrue(cg.containsValue(1.0)); + assertTrue(cg.containsValue(2.0)); + assertTrue(cg.containsValue(1.5)); + assertFalse(cg.containsValue(999.0)); + assertFalse(cg.containsValue(0.0)); + } + @Test + public void testEdgeCases() { + ColGroupPiecewiseLinearCompressed cg = createTestColGroup(); + double[] c = new double[2]; + cg.computeSum(c, 6); + assertNotNull(cg.binaryRowOpLeft(new BinaryOperator(Plus.getPlusFnObject()), new double[] {0, 0}, true)); + } } + + From 7f29223f90a677b0019893f92a5590d38c2cd7e1 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Thu, 12 Mar 2026 23:05:09 +0100 Subject: 
[PATCH 29/35] fix: put operation tests in a seperate file --- ...ecewiseLinearCompressedOperationsTest.java | 308 ++++++++++++++++++ 1 file changed, 308 insertions(+) create mode 100644 src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupPiecewiseLinearCompressedOperationsTest.java diff --git a/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupPiecewiseLinearCompressedOperationsTest.java b/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupPiecewiseLinearCompressedOperationsTest.java new file mode 100644 index 00000000000..53ae3a1277c --- /dev/null +++ b/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupPiecewiseLinearCompressedOperationsTest.java @@ -0,0 +1,308 @@ +package org.apache.sysds.test.component.compress.colgroup; + +import org.apache.sysds.runtime.compress.CompressionSettings; +import org.apache.sysds.runtime.compress.CompressionSettingsBuilder; +import org.apache.sysds.runtime.compress.colgroup.AColGroup; +import org.apache.sysds.runtime.compress.colgroup.ColGroupFactory; +import org.apache.sysds.runtime.compress.colgroup.ColGroupPiecewiseLinearCompressed; +import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory; +import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex; +import org.apache.sysds.runtime.functionobjects.Divide; +import org.apache.sysds.runtime.functionobjects.Minus; +import org.apache.sysds.runtime.functionobjects.Multiply; +import org.apache.sysds.runtime.functionobjects.Plus; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.matrix.operators.BinaryOperator; +import org.apache.sysds.runtime.matrix.operators.RightScalarOperator; +import org.apache.sysds.runtime.matrix.operators.ScalarOperator; +import org.apache.sysds.runtime.util.DataConverter; +import org.apache.sysds.test.AutomatedTestBase; +import org.junit.Before; +import org.junit.Test; + +import java.util.Random; + +import 
static org.junit.Assert.*; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +/** + * Tests for ColGroupPiecewiseLinearCompressed operations containing: scalarOperation, binaryRowOps, computeSum, + * containsValue, getIdx, getExactSizeOnDisk. + */ +public class ColGroupPiecewiseLinearCompressedOperationsTest extends AutomatedTestBase { + + private static final long SEED = 42L; + private static final int NROWS = 50; + private static final int NCOLS = 3; + private static final double TARGET_LOSS = 1e-8; + private static final double DELTA = 1e-9; + + private ColGroupPiecewiseLinearCompressed piecewiseLinearColGroup; + private MatrixBlock orignalMB; + private MatrixBlock decompressedMB; + private IColIndex colIndexes; + private int numRows; + private int numCols; + + @Before + public void setUp() { + numRows = NROWS; + numCols = NCOLS; + + /// generate random matrix + double[][] data = getRandomMatrix(numRows, numCols, -3, 3, 1.0, SEED); + orignalMB = DataConverter.convertToMatrixBlock(data); + orignalMB.allocateDenseBlock(); + + colIndexes = ColIndexFactory.create(buildColArray(numCols)); + + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(TARGET_LOSS); + + /// create ColGroupPiecewiseLinearCompressed instance + AColGroup result = ColGroupFactory.compressPiecewiseLinearFunctional(colIndexes, orignalMB, cs); + assertTrue(result instanceof ColGroupPiecewiseLinearCompressed); + piecewiseLinearColGroup = (ColGroupPiecewiseLinearCompressed) result; + + /// decompress again + decompressedMB = decompress(piecewiseLinearColGroup); + } + + private MatrixBlock decompress(AColGroup cg) { + MatrixBlock mb = new MatrixBlock(numRows, numCols, false); + mb.allocateDenseBlock(); + cg.decompressToDenseBlock(mb.getDenseBlock(), 0, numRows, 0, 0); + return mb; + } + + /// check elementwise to compare 
results from compressed and decompressed matrixblock + private void checkMatrixEquals(String msg, MatrixBlock mb1, MatrixBlock mb2) { + if(mb1.getNumRows() != mb2.getNumRows() || mb1.getNumColumns() != mb2.getNumColumns()) + fail(msg + " dimension mismatch"); + for(int r = 0; r < numRows; r++) + for(int c = 0; c < numCols; c++) + assertEquals(msg + "[" + r + "," + c + "]", mb1.get(r, c), mb2.get(r, c), DELTA); + } + + /// compute column sum to validate + private double[] computeSums(MatrixBlock mb) { + double[] sums = new double[numCols]; + for(int c = 0; c < numCols; c++) + for(int r = 0; r < numRows; r++) + sums[c] += mb.get(r, c); + return sums; + } + + /// create row vector + private double[] buildRowVector() { + double[] v = new double[numCols]; + for(int i = 0; i < numCols; i++) + v[i] = 0.5 * (i + 1); + return v; + } + + private int[] buildColArray(int n) { + int[] cols = new int[n]; + for(int i = 0; i < n; i++) + cols[i] = i; + return cols; + } + + private MatrixBlock applyBinaryRowOpLeft(MatrixBlock mb, BinaryOperator op, double[] v) { + MatrixBlock result = new MatrixBlock(numRows, numCols, false); + result.allocateDenseBlock(); + for(int r = 0; r < numRows; r++) + for(int c = 0; c < numCols; c++) + result.getDenseBlock().set(r, c, op.fn.execute(v[c], mb.get(r, c))); + return result; + } + + private MatrixBlock applyBinaryRowOpRight(MatrixBlock mb, BinaryOperator op, double[] v) { + MatrixBlock result = new MatrixBlock(numRows, numCols, false); + result.allocateDenseBlock(); + for(int r = 0; r < numRows; r++) + for(int c = 0; c < numCols; c++) + result.getDenseBlock().set(r, c, op.fn.execute(mb.get(r, c), v[c])); + return result; + } + + @Test + public void testComputeSum() { + double[] sumsComp = new double[numCols]; + piecewiseLinearColGroup.computeSum(sumsComp, numRows); + assertArrayEquals(sumsComp, computeSums(decompressedMB), DELTA); + } + + @Test + public void testComputeColSums() { + double[] sumsComp = new double[numCols]; + 
piecewiseLinearColGroup.computeColSums(sumsComp, numRows); + assertArrayEquals(sumsComp, computeSums(decompressedMB), DELTA); + } + + private void testScalarOp(ScalarOperator op, double scalar) { + MatrixBlock expected = new MatrixBlock(numRows, numCols, false); + expected.allocateDenseBlock(); + for(int r = 0; r < numRows; r++) + for(int c = 0; c < numCols; c++) + expected.getDenseBlock().set(r, c, op.fn.execute(decompressedMB.get(r, c), scalar)); + + checkMatrixEquals("scalarOp " + op.fn.getClass().getSimpleName(), expected, + decompress(piecewiseLinearColGroup.scalarOperation(op))); + } + + @Test + public void testScalarPlus() { + testScalarOp(new RightScalarOperator(Plus.getPlusFnObject(), 3.7), 3.7); + } + + @Test + public void testScalarMinus() { + testScalarOp(new RightScalarOperator(Minus.getMinusFnObject(), 1.5), 1.5); + } + + @Test + public void testScalarMultiply() { + testScalarOp(new RightScalarOperator(Multiply.getMultiplyFnObject(), 2.0), 2.0); + } + + @Test + public void testScalarDivide() { + testScalarOp(new RightScalarOperator(Divide.getDivideFnObject(), 4.0), 4.0); + } + + @Test + public void testBinaryRowOpLeftPlus() { + BinaryOperator op = new BinaryOperator(Plus.getPlusFnObject()); + double[] v = buildRowVector(); + checkMatrixEquals("binaryRowOpLeft Plus", applyBinaryRowOpLeft(decompressedMB, op, v), + decompress(piecewiseLinearColGroup.binaryRowOpLeft(op, v, false))); + } + + @Test + public void testBinaryRowOpLeftMultiply() { + BinaryOperator op = new BinaryOperator(Multiply.getMultiplyFnObject()); + double[] v = buildRowVector(); + checkMatrixEquals("binaryRowOpLeft Multiply", applyBinaryRowOpLeft(decompressedMB, op, v), + decompress(piecewiseLinearColGroup.binaryRowOpLeft(op, v, false))); + } + + @Test + public void testBinaryRowOpRightMinus() { + BinaryOperator op = new BinaryOperator(Minus.getMinusFnObject()); + double[] v = buildRowVector(); + checkMatrixEquals("binaryRowOpRight Minus", applyBinaryRowOpRight(decompressedMB, op, v), + 
decompress(piecewiseLinearColGroup.binaryRowOpRight(op, v, false))); + } + + @Test + public void testBinaryRowOpRightDivide() { + BinaryOperator op = new BinaryOperator(Divide.getDivideFnObject()); + double[] v = buildRowVector(); + checkMatrixEquals("binaryRowOpRight Divide", applyBinaryRowOpRight(decompressedMB, op, v), + decompress(piecewiseLinearColGroup.binaryRowOpRight(op, v, false))); + } + + @Test + public void testContainsValueIntercept() { + double pattern = piecewiseLinearColGroup.getInterceptsPerCol()[0][0]; + assertTrue("intercept of col 0 seg 0 should exist", piecewiseLinearColGroup.containsValue(pattern)); + } + + @Test + public void testContainsValueEndpoint() { + int[] breakpoints = piecewiseLinearColGroup.getBreakpointsPerCol()[0]; + double[] intercepts = piecewiseLinearColGroup.getInterceptsPerCol()[0]; + double[] slopes = piecewiseLinearColGroup.getSlopesPerCol()[0]; + if(breakpoints.length > 1) { + double pattern = intercepts[0] + slopes[0] * (breakpoints[1] - breakpoints[0] - 1); + assertTrue("endpoint of col 0 seg 0 should exist", piecewiseLinearColGroup.containsValue(pattern)); + } + } + + @Test + public void testContainsValueConstantSegment() { + ColGroupPiecewiseLinearCompressed cg = (ColGroupPiecewiseLinearCompressed) ColGroupPiecewiseLinearCompressed.create( + ColIndexFactory.create(new int[] {0}), new int[][] {{0, numRows}}, new double[][] {{0.0}}, + new double[][] {{1.23}}, numRows); + + assertTrue("constant value 1.23 should exist", cg.containsValue(1.23)); + assertFalse("value 2.0 should not exist", cg.containsValue(2.0)); + } + + @Test + public void testContainsValueOutsideRange() { + assertFalse("value -10 outside data range", piecewiseLinearColGroup.containsValue(-10.0)); + assertFalse("value +10 outside data range", piecewiseLinearColGroup.containsValue(10.0)); + } + + @Test + public void testGetIdxMatchesDecompress() { + for(int c = 0; c < numCols; c++) + for(int r = 0; r < numRows; r++) + assertEquals("getIdx(" + r + "," + c + 
")", decompressedMB.get(r, c), + piecewiseLinearColGroup.getIdx(r, c), 1e-10); + } + + @Test + public void testGetIdxInvalidBounds() { + assertEquals("row < 0", 0.0, piecewiseLinearColGroup.getIdx(-1, 0), DELTA); + assertEquals("row >= numRows", 0.0, piecewiseLinearColGroup.getIdx(numRows, 0), DELTA); + assertEquals("col < 0", 0.0, piecewiseLinearColGroup.getIdx(0, -1), DELTA); + assertEquals("col >= ncols", 0.0, piecewiseLinearColGroup.getIdx(0, numCols), DELTA); + } + + @Test + public void testGetNumValues() { + int expected = 0; + for(int c = 0; c < numCols; c++) { + int breakpointsLen = piecewiseLinearColGroup.getBreakpointsPerCol()[c].length; + int slopesLen = piecewiseLinearColGroup.getSlopesPerCol()[c].length; + int interceptsLen = piecewiseLinearColGroup.getInterceptsPerCol()[c].length; + assertEquals("breakpoints != slopes+1 for col " + c, breakpointsLen, slopesLen + 1); + assertEquals("slopes != intercepts for col " + c, slopesLen, interceptsLen); + expected += breakpointsLen + slopesLen + interceptsLen; + } + assertEquals("getNumValues() mismatch", expected, piecewiseLinearColGroup.getNumValues()); + } + + @Test + public void testGetExactSizeOnDisk() { + Random rng = new Random(SEED); + int rows = 80 + rng.nextInt(40); + int numSegs = 1 + rng.nextInt(3); + + int[] breakpoints = new int[numSegs + 1]; + breakpoints[0] = 0; + breakpoints[numSegs] = rows; + for(int s = 1; s < numSegs; s++) + breakpoints[s] = rng.nextInt(rows * 2 / 3) + rows / 10; + + double[] slopes = new double[numSegs]; + double[] intercepts = new double[numSegs]; + for(int s = 0; s < numSegs; s++) { + slopes[s] = rng.nextDouble() * 4 - 2; + intercepts[s] = rng.nextDouble() * 4 - 2; + } + /// PLC Piecewise Linear Compressed + AColGroup colGroupPLC = ColGroupPiecewiseLinearCompressed.create( + ColIndexFactory.create(new int[] {rng.nextInt(20)}), new int[][] {breakpoints}, new double[][] {slopes}, + new double[][] {intercepts}, rows); + + assertTrue("disk size should be positive", 
colGroupPLC.getExactSizeOnDisk() > 0); + assertTrue("num values should be positive", colGroupPLC.getNumValues() > 0); + } + + @Override + public double[][] getRandomMatrix(int rows, int cols, double min, double max, double sparsity, long seed) { + Random rng = new Random(seed); + double[][] data = new double[rows][cols]; + for(int r = 0; r < rows; r++) + for(int c = 0; c < cols; c++) + data[r][c] = min + rng.nextDouble() * (max - min); + return data; + } +} From 0b0a4fba9bcd5e5153b78c26ea61d5ec79d1e795 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Thu, 12 Mar 2026 23:07:40 +0100 Subject: [PATCH 30/35] fix: bugfixes in methods and documentation --- .../functional/PiecewiseLinearUtils.java | 194 ++++++++++++------ 1 file changed, 128 insertions(+), 66 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/functional/PiecewiseLinearUtils.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/functional/PiecewiseLinearUtils.java index 5b67cba2173..7b0b4bfa960 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/functional/PiecewiseLinearUtils.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/functional/PiecewiseLinearUtils.java @@ -1,14 +1,17 @@ package org.apache.sysds.runtime.compress.colgroup.functional; - import org.apache.sysds.runtime.compress.CompressionSettings; import org.apache.sysds.runtime.matrix.data.MatrixBlock; - import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; public class PiecewiseLinearUtils { + /** + * Utility methods for piecewise linear compression of matric columns + * supports compression used the segmented least squares algorithm which is implemented with dynamic programming + * and a successive method, which puts all values in a segment till the target loss is exceeded + */ private PiecewiseLinearUtils() { @@ -71,10 +74,10 @@ public static SegmentedRegression 
compressSegmentedLeastSquares(double[] column, return new SegmentedRegression(breakpoints, slopes, intercepts); } - public static SegmentedRegression compressSukzessivePiecewiseLinear(double[] column, CompressionSettings cs) { + public static SegmentedRegression compressSuccessivePiecewiseLinear(double[] column, CompressionSettings cs) { //compute Breakpoints for a Column with a sukzessive breakpoints algorithm - final List breakpointsList = computeBreakpointSukzessive(column, cs); + final List breakpointsList = computeBreakpointSuccessive(column, cs); final int[] breakpoints = breakpointsList.stream().mapToInt(Integer::intValue).toArray(); //get values for Regression @@ -93,80 +96,122 @@ public static SegmentedRegression compressSukzessivePiecewiseLinear(double[] col return new SegmentedRegression(breakpoints, slopes, intercepts); } + /** + * Computes breakpoints for a column using segmented least squares with dynamic programming + * Iteratively reduces lambda to increase the number of segments until the target MSE is met. 
+ * + * @param cs compression settings containing the target loss + * @param column the column values to segment + * @return list of breakpoint indices, starting with 0 + */ public static List computeBreakpoints(CompressionSettings cs, double[] column) { final int numElements = column.length; final double targetMSE = cs.getPiecewiseTargetLoss(); + final double sseMax = numElements * targetMSE; // max allowed total SSE - // max targetloss - final double sseMax = numElements * targetMSE; - double lambda = 1000.0; // Regulationparam + //start with high lambda an reduce iteratively + double lambda = Math.max(10.0, sseMax * 2.0); List bestBreaks = Arrays.asList(0, numElements); + double bestSSE = computeTotalSSE(column, bestBreaks); - for(int iter = 0; iter < 20; iter++) { // fixed Iterations + for (int iter = 0; iter < 50; iter++) { List breaks = computeBreakpointsLambda(column, lambda); double totalSSE = computeTotalSSE(column, breaks); + int numSegs = breaks.size() - 1; + + if (totalSSE < bestSSE) { + bestSSE = totalSSE; + bestBreaks = new ArrayList<>(breaks); + } + //target loss reached + if (bestSSE <= sseMax) { + return bestBreaks; + } - if(totalSSE <= sseMax) { - bestBreaks = breaks; + // only one segment left, break condition + if (numSegs <= 1) { + break; } + // reducing lambda to allow more segments in next iteration lambda *= 0.8; } return bestBreaks; } + /** + * Computes optimal breakpoints, each segment has a SEE plus a + + */ + public static List computeBreakpointsLambda(double[] column, double lambda) { - final int numRows = column.length; - final double[] costs = new double[numRows + 1]; //min Cost - final int[] prevStart = new int[numRows + 1]; //previous Start + final int n = column.length; + final double[] costs = new double[n + 1]; // min cost to reach i + final int[] prev = new int[n + 1]; + + Arrays.fill(costs, Double.POSITIVE_INFINITY); costs[0] = 0.0; - // Find Cost - for(int rowEnd = 1; rowEnd <= numRows; rowEnd++) { - costs[rowEnd] = 
Double.POSITIVE_INFINITY; - //Test all possible Segment to find the lowest costs - for(int rowStart = 0; rowStart < rowEnd; rowStart++) { - //costs per Segment = current costs + segmentloss + penaltiy - final double costCurrentSegment = computeSegmentCost(column, rowStart, rowEnd); - final double totalCost = costs[rowStart] + costCurrentSegment + lambda; - // Check if it is the better solution - if(totalCost < costs[rowEnd]) { - costs[rowEnd] = totalCost; - prevStart[rowEnd] = rowStart; + // precompute all segment costs to avoid recomputation in dynamic programming + double[][] segCosts = new double[n+1][n+1]; + for(int i = 0; i < n; i++) { + for(int j = i+1; j <= n; j++) { + segCosts[i][j] = computeSegmentCost(column, i, j); + } + } + // for each point j, find the cheapest previous breakpoint i + for(int j = 1; j <= n; j++) { + for(int i = 0; i < j; i++) { + // cost equals the SSE of segment [i,j] plus penalty plus best costs + double cost = costs[i] + segCosts[i][j] + lambda; + if(cost < costs[j]) { + costs[j] = cost; + prev[j] = i; } } } - //Check the optimal segmentlimits - final List segmentLimits = new ArrayList<>(); - int breakpointIndex = numRows; - while(breakpointIndex > 0) { - segmentLimits.add(breakpointIndex); - breakpointIndex = prevStart[breakpointIndex]; + + // Backtrack to previous points to recover the breakpoints + List breaks = new ArrayList<>(); + int j = n; + while(j > 0) { + breaks.add(j); + j = prev[j]; } - segmentLimits.add(0); - Collections.sort(segmentLimits); - return segmentLimits; + breaks.add(0); + Collections.reverse(breaks); + return breaks; } + /** + * computes the segment cost + * @param column column values + * @param start start index + * @param end end index + * @return SSE of the regression line over the segment + */ public static double computeSegmentCost(double[] column, int start, int end) { final int segSize = end - start; if(segSize <= 1) return 0.0; - final double[] ab = regressSegment(column, start, end); 
//Regressionline + final double[] ab = regressSegment(column, start, end); final double slope = ab[0]; final double intercept = ab[1]; - double sumSquaredError = 0.0; + double sse = 0.0; for(int i = start; i < end; i++) { - final double rowIdx = i; - final double actualValue = column[i]; - final double predictedValue = slope * rowIdx + intercept; - final double difference = actualValue - predictedValue; - sumSquaredError += difference * difference; + double err = column[i] - (slope * i + intercept); + sse += err * err; } - return sumSquaredError; + return sse; } + /** + * computes the total SSE over all segments defined by the given breakpoints + * @param column + * @param breaks + * @return sum of the total SSE + */ public static double computeTotalSSE(double[] column, List breaks) { double total = 0.0; for(int s = 0; s < breaks.size() - 1; s++) { @@ -184,61 +229,78 @@ public static double[] regressSegment(double[] column, int start, int end) { double sumOfRowIndices = 0, sumOfColumnValues = 0, sumOfRowIndicesSquared = 0, productRowIndexTimesColumnValue = 0; for(int i = start; i < end; i++) { - final double x = i; - final double y = column[i]; - sumOfRowIndices += x; - sumOfColumnValues += y; - sumOfRowIndicesSquared += x * x; - productRowIndexTimesColumnValue += x * y; + sumOfRowIndices += i; + sumOfColumnValues += column[i]; + sumOfRowIndicesSquared += i * i; + productRowIndexTimesColumnValue += i * column[i]; } - final double numPointsInSegmentDouble = numElements; + final double denominatorForSlope = - numPointsInSegmentDouble * sumOfRowIndicesSquared - sumOfRowIndices * sumOfRowIndices; + numElements * sumOfRowIndicesSquared - sumOfRowIndices * sumOfRowIndices; final double slope; final double intercept; if(denominatorForSlope == 0) { slope = 0.0; - intercept = sumOfColumnValues / numPointsInSegmentDouble; + intercept = sumOfColumnValues / numElements; } else { - slope = (numPointsInSegmentDouble * productRowIndexTimesColumnValue - sumOfRowIndices * 
sumOfColumnValues) / + slope = (numElements * productRowIndexTimesColumnValue - sumOfRowIndices * sumOfColumnValues) / denominatorForSlope; - intercept = (sumOfColumnValues - slope * sumOfRowIndices) / numPointsInSegmentDouble; + intercept = (sumOfColumnValues - slope * sumOfRowIndices) / numElements; } return new double[] {slope, intercept}; } - public static List computeBreakpointSukzessive(double[] column, CompressionSettings cs) { + /** + * computes breakpoints for a column using a successive algorithm + * extends each segment until the SEE reaches the target loss, then start a new segment + * @param column column values + * @param cs compression setting for setting the target loss + * @return list of breakpoint indices + */ + public static List computeBreakpointSuccessive(double[] column, CompressionSettings cs) { final int numElements = column.length; final double targetMSE = cs.getPiecewiseTargetLoss(); - if(Double.isNaN(targetMSE) || targetMSE <= 0) { - return Arrays.asList(0, numElements); // Fallback one Segment if targetloss is not valid + if (Double.isNaN(targetMSE) || targetMSE <= 0) { + return Arrays.asList(0, numElements); // fallback single segment } List breakpoints = new ArrayList<>(); - breakpoints.add(0); // first segment start is always 0 + breakpoints.add(0); int currentStart = 0; - while(currentStart < numElements) { - int bestEnd = numElements; - //Check all possible Ends for this one segment - for(int end = currentStart + 1; end <= numElements; end++) { + while (currentStart < numElements) { + int bestEnd = -1; // no end found + + for (int end = currentStart + 1; end <= numElements; end++) { double sse = computeSegmentCost(column, currentStart, end); - // Check if the loss for this segment is smaller/egual to the targetloss - double sseMax = (end - currentStart) * targetMSE; - if(sse > sseMax) { - bestEnd = end - 1; + if(sse > (end - currentStart) * targetMSE) { + // end-1 is last valid end; if end == segStart+1 force min segment of length 
1 + bestEnd = (end == currentStart + 1) ? end : end - 1; break; } } + + if (bestEnd == -1) { + bestEnd = numElements;// all remaining points fitting within budget + } + + // safety guard not allow zero segments + if (bestEnd <= currentStart) { + bestEnd = Math.min(currentStart + 1, numElements); + } + breakpoints.add(bestEnd); currentStart = bestEnd; } - if(breakpoints.get(breakpoints.size() - 1) != numElements) { + // make sure, that the last breakpoint equals numElements + int last = breakpoints.get(breakpoints.size() - 1); + if (last != numElements) { breakpoints.add(numElements); } + return breakpoints; } } From d2014d276455afbf5abfe9a6d8aedb76aa4a214a Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Thu, 12 Mar 2026 23:09:22 +0100 Subject: [PATCH 31/35] fix: extract methods, refactor tests add: tests for DP and successive Compression and documentation --- ...ColGroupPiecewiseLinearCompressedTest.java | 1000 ++++------------- 1 file changed, 248 insertions(+), 752 deletions(-) diff --git a/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java b/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java index 1672da79704..e05745bc97d 100644 --- a/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java +++ b/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java @@ -12,11 +12,7 @@ import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup; import org.apache.sysds.runtime.compress.estim.EstimationFactors; import org.apache.sysds.runtime.data.DenseBlock; -import org.apache.sysds.runtime.functionobjects.Multiply; -import org.apache.sysds.runtime.functionobjects.Plus; import org.apache.sysds.runtime.matrix.data.MatrixBlock; -import org.apache.sysds.runtime.matrix.operators.BinaryOperator; -import 
org.apache.sysds.runtime.matrix.operators.RightScalarOperator; import org.apache.sysds.runtime.util.DataConverter; import org.apache.sysds.test.AutomatedTestBase; import org.junit.Test; @@ -25,475 +21,317 @@ import java.util.List; import java.util.Random; -import static org.apache.sysds.runtime.compress.colgroup.functional.PiecewiseLinearUtils.*; import static org.junit.Assert.*; +/** + * Unit tests of ColGroupPiecewiseLinearCompression Covers Validation, Compression and decompression + */ public class ColGroupPiecewiseLinearCompressedTest extends AutomatedTestBase { + + private static final long SEED = 42L; + @Override public void setUp() { } - private static final long SEED = 42L; + @Test(expected = NullPointerException.class) + public void testCreateNullBreakpoints() { + IColIndex cols = ColIndexFactory.create(new int[] {0}); + int[][] nullBp = {null}; + ColGroupPiecewiseLinearCompressed.create(cols, nullBp, new double[][] {{1.0}}, new double[][] {{0.0}}, 10); + } + + @Test(expected = IllegalArgumentException.class) + public void testCreateTooFewBreakpoints() { + int[][] singleBp = {new int[] {0}}; + ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), singleBp, + new double[][] {new double[] {1.0}}, new double[][] {new double[] {0.0}}, 10); + } + + @Test(expected = IllegalArgumentException.class) + public void testCreateInconsistentSlopes() { + int[] bp = {0, 5, 10}; + ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), new int[][] {bp}, + new double[][] {new double[] {1.0, 2.0, 3.0}}, new double[][] {new double[] {0.0, 1.0}}, 10); + } + + @Test(expected = IllegalArgumentException.class) + public void testCreateInconsistentIntercepts() { + int[] bp = {0, 5, 10}; + ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), new int[][] {bp}, + new double[][] {new double[] {1.0, 2.0}}, new double[][] {new double[] {0.0}}, 10); + } @Test - public void 
testCompressPiecewiseLinearFunctionalRandom() { - // Generate random data + public void testCompressAndDecompressDP() { + + // create random matrix final int nrows = 50, ncols = 3; double[][] data = getRandomMatrix(nrows, ncols, -3, 3, 1.0, SEED); MatrixBlock in = DataConverter.convertToMatrixBlock(data); in.allocateDenseBlock(); - // extract columns - double[][] columns = new double[ncols][nrows]; - for(int c = 0; c < ncols; c++) - for(int r = 0; r < nrows; r++) - columns[c][r] = data[r][c]; - - // create ColIndexes - int[] colArray = {0, 1, 2}; - IColIndex colIndexes = ColIndexFactory.create(colArray); - - // set targetloss + IColIndex colIndexes = ColIndexFactory.create(new int[] {0, 1, 2}); CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(25.0); + cs.setPiecewiseTargetLoss(1e-8); - // compress AColGroup result = ColGroupFactory.compressPiecewiseLinearFunctional(colIndexes, in, cs); assertTrue(result instanceof ColGroupPiecewiseLinearCompressed); ColGroupPiecewiseLinearCompressed plGroup = (ColGroupPiecewiseLinearCompressed) result; - // check structure - int[][] bp = plGroup.getBreakpointsPerCol(); - assertEquals(3, bp.length); // 3 Spalten - assertEquals(3, colIndexes.size()); - - for(int c = 0; c < ncols; c++) { - assertEquals(0, bp[c][0]); // start with 0 - assertEquals(nrows, bp[c][bp[c].length - 1]); - assertTrue(bp[c].length >= 2); // Mind. 
1 Segment - } - + // check the structure + int[][] breakpoints = plGroup.getBreakpointsPerCol(); double[][] slopes = plGroup.getSlopesPerCol(); double[][] intercepts = plGroup.getInterceptsPerCol(); - assertEquals(3, slopes.length); + + assertEquals("wrong number of columns in breakpoints", ncols, breakpoints.length); for(int c = 0; c < ncols; c++) { - assertEquals(bp[c].length - 1, slopes[c].length); - assertEquals(bp[c].length - 1, intercepts[c].length); + assertTrue("breakpoints[" + c + "] needs at least 2 entries", breakpoints[c].length >= 2); + assertEquals("breakpoints[" + c + "] must start at 0", 0, breakpoints[c][0]); + assertEquals("breakpoints[" + c + "] must end at nrows", nrows, breakpoints[c][breakpoints[c].length - 1]); + int numSegs = breakpoints[c].length - 1; + assertEquals("slopes[" + c + "] length mismatch", numSegs, slopes[c].length); + assertEquals("intercepts[" + c + "] length mismatch", numSegs, intercepts[c].length); } - // check col indexes shouldnt change - assertEquals(3, plGroup.getColIndices().size()); - - // decompress + // decompress and check reconstruction of column group MatrixBlock recon = new MatrixBlock(nrows, ncols, false); recon.allocateDenseBlock(); - plGroup.decompressToDenseBlock(recon.getDenseBlock(), 0, nrows, 0, ncols - 1); - assertFalse(Double.isNaN(recon.get(0, 0))); + plGroup.decompressToDenseBlock(recon.getDenseBlock(), 0, nrows, 0, 0); + DenseBlock db = recon.getDenseBlock(); + + for(int r = 0; r < nrows; r++) { + for(int c = 0; c < ncols; c++) { + double val = db.get(r, c); + assertFalse("NaN at [" + r + "," + c + "]", Double.isNaN(val)); + assertFalse("Infinite at [" + r + "," + c + "]", Double.isInfinite(val)); + assertEquals("reconstruction error too large at [" + r + "," + c + "]", data[r][c], val, 1e-6); + } + } } - private void testCompressStructure(double[][] data) { - final int nrows = data.length, ncols = data[0].length; + @Test + public void testCompressAndDecompressSuccessive() { + + //create random 
matrix + final int nrows = 50, ncols = 3; + double[][] data = getRandomMatrix(nrows, ncols, -3, 3, 1.0, SEED); MatrixBlock in = DataConverter.convertToMatrixBlock(data); in.allocateDenseBlock(); - int[] colArray = new int[ncols]; - for(int i = 0; i < ncols; i++) - colArray[i] = i; - IColIndex colIndexes = ColIndexFactory.create(colArray); + IColIndex colIndexes = ColIndexFactory.create(new int[] {0, 1, 2}); CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(100.0); + cs.setPiecewiseTargetLoss(1e-8); - AColGroup result = ColGroupFactory.compressPiecewiseLinearFunctional(colIndexes, in, cs); + // create ColGroupPiecewiseLinearCompressed with successive compression + AColGroup result = ColGroupFactory.compressPiecewiseLinearFunctionalSuccessive(colIndexes, in, cs); assertTrue(result instanceof ColGroupPiecewiseLinearCompressed); ColGroupPiecewiseLinearCompressed plGroup = (ColGroupPiecewiseLinearCompressed) result; + // structure checks int[][] bp = plGroup.getBreakpointsPerCol(); - assertEquals(ncols, bp.length); - for(int c = 0; c < ncols; c++) { - assertEquals(0, bp[c][0]); - assertEquals(nrows, bp[c][bp[c].length - 1]); - } double[][] slopes = plGroup.getSlopesPerCol(); - assertEquals(ncols, slopes.length); + double[][] intercepts = plGroup.getInterceptsPerCol(); + + assertEquals("wrong number of columns in bp", ncols, bp.length); for(int c = 0; c < ncols; c++) { - assertEquals(bp[c].length - 1, slopes[c].length); + assertTrue("bp[" + c + "] needs at least 2 entries", bp[c].length >= 2); + assertEquals("bp[" + c + "] must start at 0", 0, bp[c][0]); + assertEquals("bp[" + c + "] must end at nrows", nrows, bp[c][bp[c].length - 1]); + int numSegs = bp[c].length - 1; + assertEquals("slopes[" + c + "] length mismatch", numSegs, slopes[c].length); + assertEquals("intercepts[" + c + "] length mismatch", numSegs, intercepts[c].length); } - assertEquals(ncols, plGroup.getColIndices().size()); + // validate decompression 
MatrixBlock recon = new MatrixBlock(nrows, ncols, false); recon.allocateDenseBlock(); - plGroup.decompressToDenseBlock(recon.getDenseBlock(), 0, nrows, 0, ncols - 1); - } - - @Test - public void testCompressTrendNoise() { - final int nrows = 100, ncols = 2; - Random rng = new Random(SEED); - double[][] data = new double[nrows][ncols]; + plGroup.decompressToDenseBlock(recon.getDenseBlock(), 0, nrows, 0, 0); + DenseBlock db = recon.getDenseBlock(); for(int r = 0; r < nrows; r++) { - double trend = 0.05 * r; for(int c = 0; c < ncols; c++) { - data[r][c] = trend + rng.nextGaussian() * 1.5 + c * 2.0; + double val = db.get(r, c); + assertFalse("NaN at [" + r + "," + c + "]", Double.isNaN(val)); + assertFalse("Infinite at [" + r + "," + c + "]", Double.isInfinite(val)); + assertEquals("reconstruction error too large at [" + r + "," + c + "]", data[r][c], val, 1e-6); } } - - testCompressStructure(data); } - @Test - public void testCompressJumps() { - final int nrows = 80, ncols = 3; - double[][] data = getRandomMatrix(nrows, ncols, -2, 2, 1.0, SEED); - for(int c = 0; c < ncols; c++) { - for(int r = 25; r < 55; r++) - data[r][c] += 8.0; - for(int r = 55; r < nrows; r++) - data[r][c] += 15.0; - } - testCompressStructure(data); - } + /// Wrapper-Classes: Test setup for DP and successive compression - @Test - public void testCompressHighFreq() { - final int nrows = 100, ncols = 50; - Random rng = new Random(SEED); - double[][] data = new double[nrows][ncols]; - for(int r = 0; r < nrows; r++) { - double sine = Math.sin(r * 0.4) * 4.0; - for(int c = 0; c < ncols; c++) { - data[r][c] = sine + rng.nextGaussian() * 0.8 + Math.sin(r * 0.2 + c) * 2.0; - } - } - testCompressStructure(data); + private void testRoundtripDP(double[][] data, int nrows, int ncols, double targetLoss, double tolerance, + int maxFailures) { + testRoundtrip(data, nrows, ncols, targetLoss, tolerance, maxFailures, false); } - @Test - public void testCompressSingleLowVariance() { - final int nrows = 50, ncols = 
1; - double[][] data = getRandomMatrix(nrows, ncols, -1, 1, 1.0, SEED); - testCompressStructure(data); + private void testRoundtripSuccessive(double[][] data, int nrows, int ncols, double targetLoss, double tolerance, + int maxFailures) { + testRoundtrip(data, nrows, ncols, targetLoss, tolerance, maxFailures, true); } - @Test - public void testCompressSingleColumnStructure() { - double[][] data = getRandomMatrix(50, 1, -1, 1, 1.0, SEED); - testCompressStructure(data); - } + /** + * Set test setup: converting data in matrix block, set compression setting does compression, decompression, + * validation + */ + private void testRoundtrip(double[][] data, int nrows, int ncols, double targetLoss, double tolerance, + int maxFailures, boolean successive) { - @Test(expected = NullPointerException.class) // ← Dein realer Crash! - public void testCreateNullBreakpoints() { - IColIndex cols = ColIndexFactory.create(new int[] {0}); - int[][] nullBp = {null}; - ColGroupPiecewiseLinearCompressed.create(cols, nullBp, new double[][] {{1.0}}, new double[][] {{0.0}}, 10); - } + ///create a matrix + MatrixBlock orig = DataConverter.convertToMatrixBlock(data); + orig.allocateDenseBlock(); - @Test(expected = IllegalArgumentException.class) - public void testCreateTooFewBreakpoints() { - int[][] singleBp = {new int[] {0}}; - ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), singleBp, - new double[][] {new double[] {1.0}}, new double[][] {new double[] {0.0}}, 10); - } + IColIndex colIndexes = ColIndexFactory.create(buildColArray(ncols)); + CompressionSettings cs = new CompressionSettingsBuilder().create(); + cs.setPiecewiseTargetLoss(targetLoss); - @Test(expected = IllegalArgumentException.class) - public void testCreateInconsistentSlopes() { - int[] bp = {0, 5, 10}; - ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), new int[][] {bp}, - new double[][] {new double[] {1.0, 2.0, 3.0}}, new double[][] {new double[] {0.0, 1.0}}, 10); 
- } + /// choose compression + AColGroup result = successive ? ColGroupFactory.compressPiecewiseLinearFunctionalSuccessive(colIndexes, orig, + cs) : ColGroupFactory.compressPiecewiseLinearFunctional(colIndexes, orig, cs); - @Test(expected = IllegalArgumentException.class) - public void testCreateInconsistentIntercepts() { - int[] bp = {0, 5, 10}; - ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), new int[][] {bp}, - new double[][] {new double[] {1.0, 2.0}}, new double[][] {new double[] {0.0}}, 10); - } + assertTrue(result instanceof ColGroupPiecewiseLinearCompressed); + ColGroupPiecewiseLinearCompressed plGroup = (ColGroupPiecewiseLinearCompressed) result; - private int findSegment(int[] bps, int r) { - for(int s = 0; s < bps.length - 1; s++) { - if(r < bps[s + 1]) - return s; - } - return bps.length - 2; - } + /// structure checks + checkStructure(plGroup, nrows, ncols); - @Test - public void testCreateValidMultiSegmentRandom() { - Random rng = new Random(SEED); - final int nrows = 20; + /// decompression check + MatrixBlock recon = new MatrixBlock(nrows, ncols, false); + recon.allocateDenseBlock(); + plGroup.decompressToDenseBlock(recon.getDenseBlock(), 0, nrows, 0, 0); + DenseBlock db = recon.getDenseBlock(); - int[][] bp = {{0, rng.nextInt(5) + 3, rng.nextInt(10) + 8, nrows}, {0, rng.nextInt(8) + 2, nrows}}; - double[][] slopes = {{rng.nextDouble() * 3 - 1.5, rng.nextDouble() * 3 - 1.5, rng.nextDouble() * 3 - 1.5}, - {rng.nextDouble() * 3 - 1.5, rng.nextDouble() * 3 - 1.5}}; - double[][] intercepts = {{rng.nextDouble() * 2 - 1, rng.nextDouble() * 2 - 1, rng.nextDouble() * 2 - 1}, - {rng.nextDouble() * 2 - 1, rng.nextDouble() * 2 - 1}}; + int failures = 0; + for(int r = 0; r < nrows; r++) { + for(int c = 0; c < ncols; c++) { + double val = db.get(r, c); + assertFalse("NaN at [" + r + "," + c + "]", Double.isNaN(val)); + assertFalse("Infinite at [" + r + "," + c + "]", Double.isInfinite(val)); + if(Math.abs(data[r][c] - val) > 
tolerance) + failures++; + } + } + assertTrue("too many reconstruction failures: " + failures, failures <= maxFailures); + } - IColIndex cols = ColIndexFactory.create(new int[] {0, 1}); - AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, nrows); + private void checkStructure(ColGroupPiecewiseLinearCompressed plGroup, int nrows, int ncols) { + int[][] breakpoints = plGroup.getBreakpointsPerCol(); + double[][] slopes = plGroup.getSlopesPerCol(); + double[][] intercepts = plGroup.getInterceptsPerCol(); - assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); - ColGroupPiecewiseLinearCompressed pl = (ColGroupPiecewiseLinearCompressed) cg; - assertNotSame(bp, pl.getBreakpointsPerCol()); - assertEquals(2, pl.getBreakpointsPerCol().length); + assertEquals("wrong number of columns in breakpoints", ncols, breakpoints.length); + assertEquals("wrong number of col indices", ncols, plGroup.getColIndices().size()); - for(int c = 0; c < 2; c++) { - for(int r = 0; r < nrows; r++) { - int seg = findSegment(bp[c], r); - double expected = slopes[c][seg] * r + intercepts[c][seg]; - assertEquals(expected, cg.getIdx(r, c), 1e-8); - } + for(int c = 0; c < ncols; c++) { + assertTrue("breakpoints[" + c + "] needs at least 2 entries", breakpoints[c].length >= 2); + assertEquals("breakpoints[" + c + "] must start at 0", 0, breakpoints[c][0]); + assertEquals("breakpoints[" + c + "] must end at nrows", nrows, breakpoints[c][breakpoints[c].length - 1]); + int numSegs = breakpoints[c].length - 1; + assertEquals("slopes[" + c + "] length mismatch", numSegs, slopes[c].length); + assertEquals("intercepts[" + c + "] length mismatch", numSegs, intercepts[c].length); } } - @Test - public void testCreateMultiColumnRandom() { + private double[][] buildMultiSegmentData(int nrows, int ncols) { Random rng = new Random(SEED); - final int nrows = 80, numGlobalCols = 5; - int[] globalCols = {2, 7, 12, 25, 42}; - - int numSegs = rng.nextInt(3) + 1; - int[][] bp = new 
int[numGlobalCols][numSegs + 1]; - double[][] slopes = new double[numGlobalCols][numSegs]; - double[][] intercepts = new double[numGlobalCols][numSegs]; - - double slope = rng.nextDouble() * 4 - 2; - double intercept = rng.nextDouble() * 4 - 2; - for(int c = 0; c < numGlobalCols; c++) { - bp[c][0] = 0; - bp[c][numSegs] = nrows; - for(int s = 1; s < numSegs; s++) - bp[c][s] = rng.nextInt(nrows - 10) + 5; - Arrays.fill(slopes[c], slope); - Arrays.fill(intercepts[c], intercept); - } - - IColIndex cols = ColIndexFactory.create(globalCols); - AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, nrows); - - assertTrue(cg.getNumValues() > 0); - assertEquals(numGlobalCols, cols.size()); + double[][] data = new double[nrows][ncols]; + int[] segStarts = {0, 15, 30, 45, 60}; + double[] slopes = {0.5, -1.2, 2.0, -0.8}; - for(int r = 0; r < nrows; r++) { - double expected = slope * r + intercept; - for(int localC = 0; localC < numGlobalCols; localC++) { - assertEquals(expected, cg.getIdx(r, localC), 1e-8); + for(int c = 0; c < ncols; c++) { + double offset = c; + for(int r = 0; r < nrows; r++) { + int seg = 0; + while(seg < segStarts.length - 1 && r >= segStarts[seg + 1]) + seg++; + data[r][c] = slopes[seg] * (r - segStarts[seg]) + offset + rng.nextGaussian() * 0.8; + offset += 0.01; } } + return data; + } + + private int[] buildColArray(int ncols) { + int[] cols = new int[ncols]; + for(int i = 0; i < ncols; i++) + cols[i] = i; + return cols; } @Test - public void testCreateSingleColumnRandom() { + public void testTrendWithNoise() { + final int nrows = 100, ncols = 2; Random rng = new Random(SEED); - final int nrows = rng.nextInt(30) + 20; - int numSegs = rng.nextInt(3) + 1; - - int[] bp = new int[numSegs + 1]; - bp[0] = 0; - bp[numSegs] = nrows; - for(int s = 1; s < numSegs; s++) - bp[s] = rng.nextInt(nrows / 2) + 5; - - double[] slopes = new double[numSegs]; - double[] intercepts = new double[numSegs]; - for(int s = 0; s < numSegs; s++) { - 
slopes[s] = rng.nextDouble() * 4 - 2; - intercepts[s] = rng.nextDouble() * 4 - 2; - } - - IColIndex cols = ColIndexFactory.create(new int[] {rng.nextInt(50)}); - int[][] bp2d = {bp}; - double[][] slopes2d = {slopes}; - double[][] ints2d = {intercepts}; - - AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp2d, slopes2d, ints2d, nrows); - + double[][] data = new double[nrows][ncols]; for(int r = 0; r < nrows; r++) { - int seg = findSegment(bp, r); - double expected = slopes[seg] * r + intercepts[seg]; - assertEquals(expected, cg.getIdx(r, 0), 1e-8); + double trend = 0.05 * r; + for(int c = 0; c < ncols; c++) + data[r][c] = trend + rng.nextGaussian() * 1.5 + c * 2.0; } + testRoundtripDP(data, nrows, ncols, 1.0, 4.0, 45); + testRoundtripSuccessive(data, nrows, ncols, 1.0, 4.0, 45); } @Test - public void testDecompressToDenseBlock() { - int[][] bp = {{0, 5, 10}}; - double[][] slopes = {{1.0, 2.0}}; - double[][] intercepts = {{0.0, 1.0}}; - int numRows = 10; - - IColIndex cols = ColIndexFactory.create(new int[] {0}); - AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, numRows); - - MatrixBlock target = new MatrixBlock(numRows, 1, false); - target.allocateDenseBlock(); - - DenseBlock db = target.getDenseBlock(); - assertNotNull("DenseBlock null?", db); - - cg.decompressToDenseBlock(db, 0, numRows, 0, 0); - - for(int r = 0; r < numRows; r++) { - double expected = (r < 5) ? 
(1.0 * r + 0.0) : (2.0 * r + 1.0); - assertEquals("Row " + r + " mismatch", expected, db.get(r, 0), 1e-9); + public void testAbruptJumps() { + final int nrows = 80, ncols = 3; + double[][] data = getRandomMatrix(nrows, ncols, -2, 2, 1.0, SEED); + for(int c = 0; c < ncols; c++) { + for(int r = 25; r < 55; r++) + data[r][c] += 8.0; + for(int r = 55; r < nrows; r++) + data[r][c] += 15.0; } - - assertEquals(0.0, db.get(0, 0), 1e-9); - assertEquals(4.0, db.get(4, 0), 1e-9); - assertEquals(11.0, db.get(5, 0), 1e-9); - assertEquals(19.0, db.get(9, 0), 1e-9); - } - - private ColGroupPiecewiseLinearCompressed createTestGroup(int numRows) { - int[][] bp = {{0, 5, numRows}}; - double[][] slopes = {{1.0, 3.0}}; - double[][] intercepts = {{0.0, 2.0}}; - return (ColGroupPiecewiseLinearCompressed) ColGroupPiecewiseLinearCompressed.create( - ColIndexFactory.create(new int[] {0}), bp, slopes, intercepts, numRows); + // successive needs looser tolerance on jumps + testRoundtripDP(data, nrows, ncols, 5.0, 10.0, 50); + testRoundtripSuccessive(data, nrows, ncols, 25.0, 18.0, 55); } - private double computeMSE(MatrixBlock orig, MatrixBlock recon) { - double sumSqErr = 0.0; - final int rows = orig.getNumRows(), cols = orig.getNumColumns(); - DenseBlock origDb = orig.getDenseBlock(); - DenseBlock reconDb = recon.getDenseBlock(); - - for(int r = 0; r < rows; r++) - for(int c = 0; c < cols; c++) { - double diff = origDb.get(r, c) - reconDb.get(r, c); - sumSqErr += diff * diff; - } - return sumSqErr / (rows * cols); + @Test + public void testHighFrequency() { + final int nrows = 100, ncols = 50; + Random rng = new Random(SEED); + double[][] data = new double[nrows][ncols]; + for(int r = 0; r < nrows; r++) { + double sine = Math.sin(r * 0.4) * 4.0; + for(int c = 0; c < ncols; c++) + data[r][c] = sine + rng.nextGaussian() * 0.8 + Math.sin(r * 0.2 + c) * 2.0; + } + // both struggle with high frequency; successive slightly worse + testRoundtripDP(data, nrows, ncols, 2.0, 2.0, 3500); + 
testRoundtripSuccessive(data, nrows, ncols, 2.0, 2.5, 2500); } @Test - public void testDecompressRandomMultiCol() { - final int nrows = 50, ncols = 3; - double[][] origData = getRandomMatrix(nrows, ncols, -3, 3, 1.0, SEED); - - int[] colArray = {0, 1, 2}; - IColIndex cols = ColIndexFactory.create(colArray); - CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(10.0); - - MatrixBlock orig = DataConverter.convertToMatrixBlock(origData); - orig.allocateDenseBlock(); - - AColGroup cg = ColGroupFactory.compressPiecewiseLinearFunctional(cols, orig, cs); - ColGroupPiecewiseLinearCompressed pl = (ColGroupPiecewiseLinearCompressed) cg; - - MatrixBlock recon = new MatrixBlock(nrows, ncols, false); - recon.allocateDenseBlock(); - pl.decompressToDenseBlock(recon.getDenseBlock(), 0, nrows, 0, ncols - 1); - - double mse = computeMSE(orig, recon); - assertTrue("MSE=" + mse + " > bound 20.0", mse <= 20.0); + public void testLowVarianceSingleColumn() { + double[][] data = getRandomMatrix(50, 1, -1, 1, 0.3, SEED); + testRoundtripDP(data, 50, 1, 0.1, 0.5, 5); + testRoundtripSuccessive(data, 50, 1, 0.05, 0.4, 3); } @Test - public void testDecompressRandomSingleCol() { - final int nrows = 40, ncols = 1; - double[][] origData = getRandomMatrix(nrows, ncols, -2, 2, 1.0, SEED); - - IColIndex cols = ColIndexFactory.create(new int[] {0}); - CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(5.0); - - MatrixBlock orig = DataConverter.convertToMatrixBlock(origData); - orig.allocateDenseBlock(); - - AColGroup cg = ColGroupFactory.compressPiecewiseLinearFunctional(cols, orig, cs); - ColGroupPiecewiseLinearCompressed pl = (ColGroupPiecewiseLinearCompressed) cg; - - MatrixBlock recon = new MatrixBlock(nrows, 1, false); - recon.allocateDenseBlock(); - pl.decompressToDenseBlock(recon.getDenseBlock(), 0, nrows, 0, 0); - - double mse = computeMSE(orig, recon); - assertTrue("Single-Col MSE=" + mse + " > 8.0", mse 
<= 8.0); + public void testSingleColumn() { + double[][] data = getRandomMatrix(50, 1, -1, 1, 1.0, SEED); + testRoundtripDP(data, 50, 1, 0.5, 1.0, 8); + testRoundtripSuccessive(data, 50, 1, 0.5, 1.0, 8); } @Test - public void testDecompressRandomTrend() { + public void testKnownSegmentBoundaries() { final int nrows = 60, ncols = 2; - Random rng = new Random(SEED); - double[][] origData = new double[nrows][ncols]; - - for(int r = 0; r < nrows; r++) { - double trend = 0.03 * r; - for(int c = 0; c < ncols; c++) { - origData[r][c] = trend + rng.nextGaussian() * 1.2 + c * 1.5; - } - } - - int[] colArray = {0, 1}; - IColIndex cols = ColIndexFactory.create(colArray); - - CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(8.0); - - MatrixBlock orig = DataConverter.convertToMatrixBlock(origData); - orig.allocateDenseBlock(); - - AColGroup cg = ColGroupFactory.compressPiecewiseLinearFunctional(cols, orig, cs); - assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); - ColGroupPiecewiseLinearCompressed pl = (ColGroupPiecewiseLinearCompressed) cg; - - MatrixBlock recon = new MatrixBlock(nrows, ncols, false); - recon.allocateDenseBlock(); - pl.decompressToDenseBlock(recon.getDenseBlock(), 0, nrows, 0, ncols - 1); - - double mse = computeMSE(orig, recon); - assertTrue("Trend MSE=" + String.format("%.4f", mse) + " > bound 12.0", mse <= 12.0); - - int[][] bp = pl.getBreakpointsPerCol(); - assertEquals(2, bp.length); - for(int c = 0; c < 2; c++) { - assertEquals(0, bp[c][0]); - assertEquals(nrows, bp[c][bp[c].length - 1]); - assertTrue(bp[c].length >= 2); - } + double[][] data = buildMultiSegmentData(nrows, ncols); + // successive needs slightly higher targetLoss for same data + testRoundtripDP(data, nrows, ncols, 0.8, 5.0, 35); + testRoundtripSuccessive(data, nrows, ncols, 1.0, 5.0, 35); } @Test - public void testDecompressRandomJumps() { - final int nrows = 50, ncols = 2; - double[][] origData = getRandomMatrix(nrows, ncols, -2, 
2, 1.0, SEED); - - for(int c = 0; c < ncols; c++) { - for(int r = 20; r < 30; r++) - origData[r][c] += 2.0; - for(int r = 35; r < nrows; r++) - origData[r][c] += 7.0; - } - - int[] colArray = {0, 1}; - IColIndex cols = ColIndexFactory.create(colArray); - - CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(12.0); - - MatrixBlock orig = DataConverter.convertToMatrixBlock(origData); - orig.allocateDenseBlock(); - - AColGroup cg = ColGroupFactory.compressPiecewiseLinearFunctional(cols, orig, cs); - assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); - ColGroupPiecewiseLinearCompressed pl = (ColGroupPiecewiseLinearCompressed) cg; - - MatrixBlock recon = new MatrixBlock(nrows, ncols, false); - recon.allocateDenseBlock(); - pl.decompressToDenseBlock(recon.getDenseBlock(), 0, nrows, 0, ncols - 1); - - double mse = computeMSE(orig, recon); - assertTrue("Jumps MSE=" + String.format("%.4f", mse) + " > bound 18.0", mse <= 18.0); - - int[][] bp = pl.getBreakpointsPerCol(); - assertEquals(2, bp.length); - for(int c = 0; c < 2; c++) { - assertEquals(0, bp[c][0]); - assertEquals(nrows, bp[c][bp[c].length - 1]); - assertTrue(bp[c].length >= 3); - } + public void testMultipleColumns() { + double[][] data = getRandomMatrix(80, 5, -5, 5, 1.5, SEED); + testRoundtripDP(data, 80, 5, 3.0, 4.0, 120); + testRoundtripSuccessive(data, 80, 5, 3.0, 4.0, 120); } private CompressedSizeInfo createTestCompressedSizeInfo() { @@ -529,222 +367,81 @@ public void testCompressPiecewiseLinearViaRealAPI() { assertTrue(hasPiecewise); } - private double computeColumnMSE(MatrixBlock orig, MatrixBlock target, int col) { - final int numRows = orig.getNumRows(); - double totalSSE = 0.0; - final int origStride = orig.getNumColumns(); - final int tgtStride = target.getNumColumns(); - - for(int r = 0; r < numRows; r++) { - double origVal = orig.getDenseBlock().pos(r * origStride + col); - double tgtVal = target.getDenseBlock().pos(r * tgtStride + col); - 
totalSSE += (origVal - tgtVal) * (origVal - tgtVal); - } - return totalSSE / numRows; - } - @Test - public void testSukzessiveLinearColumnSingleSegment() { - double[] linearCol = {1.0, 2.0, 3.0, 4.0, 5.0}; + public void testSuccessiveLinearColumnSingleSegment() { + double[] col = {1.0, 2.0, 3.0, 4.0, 5.0}; CompressionSettings cs = new CompressionSettingsBuilder().create(); cs.setPiecewiseTargetLoss(1e-6); - List breaks = PiecewiseLinearUtils.computeBreakpointSukzessive(linearCol, cs); + List breaks = PiecewiseLinearUtils.computeBreakpointSuccessive(col, cs); assertEquals("[0, 5]", breaks.toString()); } @Test - public void testSukzessiveNoisyColumnMultipleSegments() { - double[] noisyCol = {1.1, 1.9, 2.2, 10.1, 10.8, 11.3}; + public void testSuccessiveNoisyColumnMultipleSegments() { + double[] col = {1.1, 1.9, 2.2, 10.1, 10.8, 11.3}; CompressionSettings cs = new CompressionSettingsBuilder().create(); cs.setPiecewiseTargetLoss(1.0); - List breaks = PiecewiseLinearUtils.computeBreakpointSukzessive(noisyCol, cs); - assertTrue(breaks.size() >= 3); + List breaks = PiecewiseLinearUtils.computeBreakpointSuccessive(col, cs); + assertTrue("expected at least 3 breakpoints", breaks.size() >= 3); } @Test - public void testSukzessiveTargetLossIncreasesSegments() { - double[] colWithJumps = {1, 2, 3, 10, 11, 12, 20, 21, 22}; - CompressionSettings csStrict = new CompressionSettingsBuilder().create(); - csStrict.setPiecewiseTargetLoss(0.01); - - CompressionSettings csLoose = new CompressionSettingsBuilder().create(); - csLoose.setPiecewiseTargetLoss(10.0); - - List strictBreaks = PiecewiseLinearUtils.computeBreakpointSukzessive(colWithJumps, csStrict); - List looseBreaks = PiecewiseLinearUtils.computeBreakpointSukzessive(colWithJumps, csLoose); - - assertTrue(strictBreaks.size() > looseBreaks.size()); - } - - @Test - public void testMultiColumnTargetLossRespected() { - final int rows = 50, cols = 2; - double[][] data = getRandomMatrix(rows, cols, 0, 10, 1.0, 42L); - MatrixBlock 
orig = DataConverter.convertToMatrixBlock(data); - orig.allocateDenseBlock(); - - IColIndex colIdx = ColIndexFactory.create(0, cols - 1); - CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(1.0); + public void testSuccessiveStrictLossProducesMoreSegments() { + double[] col = {1, 2, 3, 10, 11, 12, 20, 21, 22}; - AColGroup cg = ColGroupFactory.compressPiecewiseLinearFunctionalSukzessive(colIdx, orig, cs); + CompressionSettings strict = new CompressionSettingsBuilder().create(); + strict.setPiecewiseTargetLoss(0.01); - MatrixBlock target = new MatrixBlock(rows, cols, false); - target.allocateDenseBlock(); + CompressionSettings loose = new CompressionSettingsBuilder().create(); + loose.setPiecewiseTargetLoss(10.0); - cg.decompressToDenseBlock(target.getDenseBlock(), 0, rows - 1, 0, cols - 1); + List strictBreaks = PiecewiseLinearUtils.computeBreakpointSuccessive(col, strict); + List looseBreaks = PiecewiseLinearUtils.computeBreakpointSuccessive(col, loose); - for(int c = 0; c < cols; c++) { - double mse = computeColumnMSE(orig, target, c); - System.out.println("Col " + c + " MSE = " + mse); - assertTrue("Col " + c + " MSE=" + mse + " > target=1.0", mse <= 1.0 + 1e-10); - } + assertTrue("strict loss should produce more segments", strictBreaks.size() > looseBreaks.size()); } @Test - public void testMultiColumnRandomDecompressLoss() { - final int rows = 60, cols = 3; - double[][] origData = getRandomMatrix(rows, cols, -5, 5, 1.0, SEED); + public void testSuccessiveBreakpointDetectedAtJump() { + double[] col = getRandomColumn(30, SEED); + for(int r = 10; r < 20; r++) + col[r] += 8.0; - IColIndex colIdx = ColIndexFactory.create(0, cols - 1); CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(8.0); - - MatrixBlock orig = DataConverter.convertToMatrixBlock(origData); - orig.allocateDenseBlock(); - - AColGroup cg = ColGroupFactory.compressPiecewiseLinearFunctionalSukzessive(colIdx, orig, 
cs); + cs.setPiecewiseTargetLoss(2.0); - MatrixBlock target = new MatrixBlock(rows, cols, false); - target.allocateDenseBlock(); - cg.decompressToDenseBlock(target.getDenseBlock(), 0, rows, 0, cols - 1); + int[] bps = PiecewiseLinearUtils.computeBreakpointSuccessive(col, cs).stream().mapToInt(Integer::intValue) + .toArray(); - for(int c = 0; c < cols; c++) { - double mse = computeColumnMSE(orig, target, c); - assertTrue("Col " + c + " MSE=" + mse + " > bound 15.0", mse <= 15.0); - } + assertTrue("expected at least 3 segments", bps.length >= 3); + assertTrue("expected breakpoint near jump [10,20]", hasBreakInRange(bps, 8, 22)); } @Test - public void testDecompressRandomTrendJumps() { - final int rows = 80, cols = 2; - Random rng = new Random(42L); - double[][] origData = new double[rows][cols]; - - for(int r = 0; r < rows; r++) { - double trend = 0.04 * r; - for(int c = 0; c < cols; c++) { - origData[r][c] = trend + rng.nextGaussian() * 1.5; - if(r >= 25 && r < 45) - origData[r][c] += 6.0; - if(r >= 60) - origData[r][c] += 10.0; - } - } - - IColIndex colIdx = ColIndexFactory.create(0, cols - 1); + public void testSuccessiveGlobalMSEWithinTarget() { + double[] col = getRandomColumn(40, SEED + 1); CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(10.0); - - MatrixBlock orig = DataConverter.convertToMatrixBlock(origData); - orig.allocateDenseBlock(); - - AColGroup cg = ColGroupFactory.compressPiecewiseLinearFunctionalSukzessive(colIdx, orig, cs); - MatrixBlock target = new MatrixBlock(rows, cols, false); - target.allocateDenseBlock(); - cg.decompressToDenseBlock(target.getDenseBlock(), 0, rows, 0, cols - 1); - - for(int c = 0; c < cols; c++) { - double mse = computeColumnMSE(orig, target, c); - assertTrue("Trend+Jumps Col " + c + ": MSE=" + mse + " > 20.0", mse <= 20.0); - } - } - - @Test - public void testDecompressRandomSingleColSukzessive() { - final int rows = 40; - Random rng = new Random(SEED); - double[] origCol = 
new double[rows]; - - for(int r = 0; r < rows; r++) { - origCol[r] = 0.02 * r + rng.nextGaussian() * 0.8; - } - - double[][] origData = new double[rows][1]; - for(int r = 0; r < rows; r++) - origData[r][0] = origCol[r]; - - IColIndex colIdx = ColIndexFactory.create(new int[] {0}); - - CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(1.0); - - MatrixBlock orig = DataConverter.convertToMatrixBlock(origData); - orig.allocateDenseBlock(); - - AColGroup cg = ColGroupFactory.compressPiecewiseLinearFunctionalSukzessive(colIdx, orig, cs); - ColGroupPiecewiseLinearCompressed pl = (ColGroupPiecewiseLinearCompressed) cg; - - MatrixBlock target = new MatrixBlock(rows, 1, false); - target.allocateDenseBlock(); - pl.decompressToDenseBlock(target.getDenseBlock(), 0, rows, 0, 0); + cs.setPiecewiseTargetLoss(1.5); - double mse = computeColumnMSE(orig, target, 0); - assertTrue("Single-Col MSE=" + mse + " > 3.0", mse <= 3.0); + List bps = PiecewiseLinearUtils.computeBreakpointSuccessive(col, cs); + double sse = 0.0; + for(int i = 0; i < bps.size() - 1; i++) + sse += PiecewiseLinearUtils.computeSegmentCost(col, bps.get(i), bps.get(i + 1)); - int[][] bp = pl.getBreakpointsPerCol(); - assertEquals(1, bp.length); - assertEquals(0, bp[0][0]); - assertEquals(rows, bp[0][bp[0].length - 1]); + double mse = sse / col.length; + assertTrue("global MSE=" + mse + " exceeds target=" + cs.getPiecewiseTargetLoss(), + mse <= cs.getPiecewiseTargetLoss() + 1e-10); } private boolean hasBreakInRange(int[] bps, int min, int max) { - for(int i = 1; i < bps.length - 1; i++) { + for(int i = 1; i < bps.length - 1; i++) if(bps[i] >= min && bps[i] <= max) return true; - } return false; } - @Test - public void testBreakpointsRandomJump() { - final int len = 30; - double[] col = getRandomColumn(len, SEED); - - for(int r = 10; r < 20; r++) - col[r] += 8.0; - - CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(2.0); - - 
List bps = computeBreakpointSukzessive(col, cs); - int[] bpsArray = bps.stream().mapToInt(Integer::intValue).toArray(); - - assertTrue(" (Segs=" + bps.size() + ")", bps.size() >= 3); - assertTrue("No Break in Jump", hasBreakInRange(bpsArray, 8, 22)); - } - - @Test - public void testGlobalMSE_random() { - final int len = 40; - double[] col = getRandomColumn(len, SEED + 1); - - CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(1.5); - - List bps = computeBreakpointSukzessive(col, cs); - double totalSSE = 0.0; - for(int i = 0; i < bps.size() - 1; i++) { - totalSSE += computeSegmentCost(col, bps.get(i), bps.get(i + 1)); - } - double mse = totalSSE / col.length; - - assertTrue("Global MSE=" + mse + " > target=" + cs.getPiecewiseTargetLoss(), - mse <= cs.getPiecewiseTargetLoss() + 1e-10); - } - private double[] getRandomColumn(int len, long seed) { Random rng = new Random(seed); double[] col = new double[len]; @@ -753,207 +450,6 @@ private double[] getRandomColumn(int len, long seed) { return col; } - @Test - public void testGetExactSizeOnDiskRandom() { - Random rng = new Random(SEED); - final int nrows = 80 + rng.nextInt(40); - - int numSegs = 1 + rng.nextInt(3); - int[] bp = new int[numSegs + 1]; - bp[0] = 0; - bp[numSegs] = nrows; - for(int s = 1; s < numSegs; s++) - bp[s] = rng.nextInt(nrows * 2 / 3) + nrows / 10; - - double[] slopes = new double[numSegs]; - double[] intercepts = new double[numSegs]; - for(int s = 0; s < numSegs; s++) { - slopes[s] = rng.nextDouble() * 4 - 2; - intercepts[s] = rng.nextDouble() * 4 - 2; - } - - IColIndex cols = ColIndexFactory.create(new int[] {rng.nextInt(20)}); - int[][] bp2d = {bp}; - double[][] slopes2d = {slopes}; - double[][] ints2d = {intercepts}; - - AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp2d, slopes2d, ints2d, nrows); - - long diskSize = cg.getExactSizeOnDisk(); - System.out.println("Single Random: nrows=" + nrows + ", segs=" + numSegs + ", size=" + 
diskSize); - - assertTrue(diskSize > 0); - assertTrue(cg.getNumValues() > 0); - } - - @Test - public void testMultiColSizeRandom() { - Random rng = new Random(SEED + 1); - final int nrows = 100; - final int numGlobalCols = 3 + rng.nextInt(3); - int[] globalCols = new int[numGlobalCols]; - for(int i = 0; i < numGlobalCols; i++) - globalCols[i] = rng.nextInt(50) + i * 5; - - int[][] bp = new int[numGlobalCols][]; - double[][] slopes = new double[numGlobalCols][]; - double[][] intercepts = new double[numGlobalCols][]; - - for(int c = 0; c < numGlobalCols; c++) { - int numSegs = 1 + rng.nextInt(4); - bp[c] = new int[numSegs + 1]; - bp[c][0] = 0; - bp[c][numSegs] = nrows; - for(int s = 1; s < numSegs; s++) - bp[c][s] = rng.nextInt(nrows * 3 / 4) + nrows / 8; - - slopes[c] = new double[numSegs]; - intercepts[c] = new double[numSegs]; - for(int s = 0; s < numSegs; s++) { - slopes[c][s] = rng.nextDouble() * 3 - 1.5; - intercepts[c][s] = rng.nextDouble() * 3 - 1.5; - } - } - - IColIndex cols = ColIndexFactory.create(globalCols); - AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, nrows); - - if(cg instanceof ColGroupPiecewiseLinearCompressed) { - ColGroupPiecewiseLinearCompressed pl = (ColGroupPiecewiseLinearCompressed) cg; - - long diskSize = cg.getExactSizeOnDisk(); - System.out.println("Multi Random: cols=" + numGlobalCols + ", size=" + diskSize); - - assertEquals(numGlobalCols, cols.size()); - assertEquals(numGlobalCols, pl.getBreakpointsPerCol().length); - for(int c = 0; c < numGlobalCols; c++) { - assertEquals(nrows, pl.getBreakpointsPerCol()[c][pl.getBreakpointsPerCol()[c].length - 1]); - } - assertTrue(diskSize > 0); - } - - } - - private ColGroupPiecewiseLinearCompressed createTestColGroup() { - int[][] bps = {{0, 2, 6}, // Col 0: Seg1(len=2), Seg2(len=4) - {0, 3, 6} // Col 1: Seg1(len=3), Seg2(len=3) - }; - double[][] ints = {{1.0, 3.0}, // Col 0 intercepts - {2.0, 4.0} // Col 1 intercepts - }; - double[][] slps = {{0.5, 1.0}, 
// Col 0 slopes - {0.0, 2.0} // Col 1 slopes - }; - return new ColGroupPiecewiseLinearCompressed(ColIndexFactory.create(0, 2), bps, slps, ints, 6); - } - - @Test - public void testComputeSum() { - ColGroupPiecewiseLinearCompressed cg = createTestColGroup(); - double[] c = new double[2]; - cg.computeSum(c, 6); - assertEquals(20.5, c[0], 1e-8); - assertEquals(24.0, c[1], 1e-8); - } - - @Test - public void testComputeColSums() { - ColGroupPiecewiseLinearCompressed cg = createTestColGroup(); - double[] c = new double[2]; - - cg.computeColSums(c, 6); - assertEquals(20.5, c[0], 1e-8); - assertEquals(24.0, c[1], 1e-8); - } - - @Test - public void testSingleColumn() { - int[][] bps1 = {{0, 3}}; - double[][] ints1 = {{1.0}}; - double[][] slps1 = {{2.0}}; - ColGroupPiecewiseLinearCompressed cg1 = new ColGroupPiecewiseLinearCompressed(ColIndexFactory.create(0, 1), - bps1, slps1, ints1, 3); - - RightScalarOperator plus5 = new RightScalarOperator(Plus.getPlusFnObject(), 5.0); - AColGroup result = cg1.scalarOperation(plus5); - - ColGroupPiecewiseLinearCompressed plResult = (ColGroupPiecewiseLinearCompressed) result; - assertEquals(6.0, plResult.getInterceptsPerCol()[0][0], 1e-8); - double[] origSum = new double[1]; - cg1.computeSum(origSum, 3); - double[] newSum = new double[1]; - ((ColGroupPiecewiseLinearCompressed) result).computeSum(newSum, 3); - assertEquals(origSum[0] + 5.0 * 3, newSum[0], 1e-8); - } - - @Test - public void testScalarPlus() { - ColGroupPiecewiseLinearCompressed cg = createTestColGroup(); - RightScalarOperator plus2 = new RightScalarOperator(Plus.getPlusFnObject(), 2.0); - ColGroupPiecewiseLinearCompressed result = (ColGroupPiecewiseLinearCompressed) cg.scalarOperation(plus2); - ColGroupPiecewiseLinearCompressed plResult = (ColGroupPiecewiseLinearCompressed) result; - - assertArrayEquals(new double[] {0.5, 1.0}, plResult.getSlopesPerCol()[0], 1e-8); - assertArrayEquals(new double[] {0.0, 2.0}, plResult.getSlopesPerCol()[1], 1e-8); - - assertArrayEquals(new 
double[] {3.0, 5.0}, plResult.getInterceptsPerCol()[0], 1e-8); - assertArrayEquals(new double[] {4.0, 6.0}, plResult.getInterceptsPerCol()[1], 1e-8); - - double[] origSums = new double[2]; - cg.computeSum(origSums, 6); - double[] newSums = new double[2]; - result.computeSum(newSums, 6); - assertEquals(origSums[0] + 12.0, newSums[0], 1e-8); - assertEquals(origSums[1] + 12.0, newSums[1], 1e-8); - } - - @Test - public void testBinaryRowOpLeftMultiply() { - ColGroupPiecewiseLinearCompressed cg = createTestColGroup(); - double[] v = {3.0, 4.0}; - BinaryOperator mult = new BinaryOperator(Multiply.getMultiplyFnObject()); - - AColGroup result = cg.binaryRowOpLeft(mult, v, false); - - double[] sums = new double[2]; - result.computeColSums(sums, 6); - - assertEquals(61.5, sums[0], 1e-8); - assertEquals(96.0, sums[1], 1e-8); - } - - @Test - public void testBinaryRowOpRightPlus() { - ColGroupPiecewiseLinearCompressed cg = createTestColGroup(); - double[] v = {1.0, 2.0}; - BinaryOperator plus = new BinaryOperator(Plus.getPlusFnObject()); - - AColGroup result = cg.binaryRowOpRight(plus, v, false); - - double[] sums = new double[2]; - result.computeColSums(sums, 6); - assertEquals(26.5, sums[0], 1e-8); - assertEquals(36.0, sums[1], 1e-8); - } - - @Test - public void testContainsValue() { - ColGroupPiecewiseLinearCompressed cg = createTestColGroup(); - assertTrue(cg.containsValue(1.0)); - assertTrue(cg.containsValue(2.0)); - assertTrue(cg.containsValue(1.5)); - assertFalse(cg.containsValue(999.0)); - assertFalse(cg.containsValue(0.0)); - } - - @Test - public void testEdgeCases() { - ColGroupPiecewiseLinearCompressed cg = createTestColGroup(); - double[] c = new double[2]; - cg.computeSum(c, 6); - assertNotNull(cg.binaryRowOpLeft(new BinaryOperator(Plus.getPlusFnObject()), new double[] {0, 0}, true)); - } - } From fc908982505f659c66bcddab64077c118ed9cd76 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Thu, 12 Mar 2026 23:10:11 +0100 Subject: 
[PATCH 32/35] fix: refactoring and documentation --- .../ColGroupPiecewiseLinearCompressed.java | 329 +++++++++++------- 1 file changed, 210 insertions(+), 119 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressed.java index d6ad0c6c421..f05a5d46e79 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressed.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressed.java @@ -19,8 +19,16 @@ import java.util.Arrays; -public class ColGroupPiecewiseLinearCompressed extends AColGroupCompressed { +/** + * This class represents a new ColGroup which is compresses column into segments (piecewise linear) to represent the + * original Data each column is approximate by a set of linear segments defined by breakpoints, slopes and intercepts + */ +public class ColGroupPiecewiseLinearCompressed extends AColGroupCompressed { + /** + * breakpoints indices per column to define the segment boundaries slopes of the regression line per segment per + * column intercepts of the regression line per segment per column + */ int[][] breakpointsPerCol; double[][] slopesPerCol; double[][] interceptsPerCol; @@ -34,21 +42,31 @@ public ColGroupPiecewiseLinearCompressed(IColIndex colIndices, int[][] breakpoin double[][] intercepts, int numRows) { super(colIndices); this.breakpointsPerCol = breakpoints; - this.slopesPerCol = slopes; - this.interceptsPerCol = intercepts; + this.slopesPerCol = slopes.clone(); + this.interceptsPerCol = intercepts.clone(); this.numRows = numRows; } + /** + * creates a new piecewise linear compress column group validates inputs and copies all arrays before storing + * + * @param colIndices the column indices this group represents + * @param breakpointsPerCol breakpoint indices per column + * @param slopesPerCol slope of each 
segment per column + * @param interceptsPerCol intercept of each segment per column + * @param numRows number of rows in the original matrix + * @return a new ColGroupPiecewiseLinearCompressed instance + * @throws IllegalArgumentException if breakpoints are invalid or arrays are inconsistent + */ + public static AColGroup create(IColIndex colIndices, int[][] breakpointsPerCol, double[][] slopesPerCol, double[][] interceptsPerCol, int numRows) { - int expectedCols = colIndices.size(); - if(breakpointsPerCol.length != expectedCols) + final int numCols = colIndices.size(); + if(breakpointsPerCol.length != numCols) throw new IllegalArgumentException( - "bp.length=" + breakpointsPerCol.length + " != colIndices.size()=" + expectedCols); - if(breakpointsPerCol.length != colIndices.size()) - throw new IllegalArgumentException("Need at least one segment"); + "bp.length=" + breakpointsPerCol.length + " != colIndices.size()=" + numCols); - for(int c = 0; c < colIndices.size(); c++) { + for(int c = 0; c < numCols; c++) { if(breakpointsPerCol[c].length < 1 || breakpointsPerCol[c][0] != 0 || breakpointsPerCol[c][breakpointsPerCol[c].length - 1] != numRows) throw new IllegalArgumentException( @@ -59,11 +77,10 @@ public static AColGroup create(IColIndex colIndices, int[][] breakpointsPerCol, throw new IllegalArgumentException("Inconsistent array lengths col " + c); } - int numCols = colIndices.size(); int[][] bpCopy = new int[numCols][]; double[][] slopeCopy = new double[numCols][]; double[][] interceptCopy = new double[numCols][]; - + // defensive copy to prevent external modification for(int c = 0; c < numCols; c++) { bpCopy[c] = Arrays.copyOf(breakpointsPerCol[c], breakpointsPerCol[c].length); slopeCopy[c] = Arrays.copyOf(slopesPerCol[c], slopesPerCol[c].length); @@ -74,6 +91,16 @@ public static AColGroup create(IColIndex colIndices, int[][] breakpointsPerCol, } + /** + * Decompresses a ColGroupPiecewiseLinearCompress into a DenseBlock Each value is reconstructed via 
slopes[seg]*row + * + intercept[seg] + * + * @param db Target DenseBlock + * @param rl Row to start decompression from + * @param ru Row to end decompression at (not inclusive) + * @param offR Row offset into the target to decompress + * @param offC Column offset into the target to decompress + */ @Override public void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int offC) { if(db == null || _colIndexes == null || _colIndexes.size() == 0 || breakpointsPerCol == null || @@ -86,7 +113,7 @@ public void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int double[] slopes = slopesPerCol[col]; double[] intercepts = interceptsPerCol[col]; // per segment in this column - for(int seg = 0; seg + 1 < breakpoints.length; seg++) { // ← +1 statt length + for(int seg = 0; seg + 1 < breakpoints.length; seg++) { int segStart = breakpoints[seg]; int segEnd = breakpoints[seg + 1]; if(segStart >= segEnd) @@ -94,6 +121,7 @@ public void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int double currentSlopeInSegment = slopes[seg]; double currentInterceptInSegment = intercepts[seg]; + // intersect segment with requested row range [rl, ru) int rowStart = Math.max(segStart, rl); int rowEnd = Math.min(segEnd, ru); @@ -128,42 +156,64 @@ public double[][] getInterceptsPerCol() { return interceptsPerCol; } + /** + * Return a decompressed value at row r and column colIdx uses binary search to find the correct segment + * + * @param r row + * @param colIdx column index in the _colIndexes. 
+ * @return reconstructed value with slope[segment]*r+intercepts[segment] + */ @Override public double getIdx(int r, int colIdx) { - //Check if the rowIDx is valid (safety check) + //safety check if(r < 0 || r >= numRows || colIdx < 0 || colIdx >= _colIndexes.size()) { return 0.0; } - int[] bps = breakpointsPerCol[colIdx]; - double[] slps = slopesPerCol[colIdx]; - double[] ints = interceptsPerCol[colIdx]; - // Using Binary Search for efficient Search for the right Segment ( finding rowIdx r) - // have to use int higherBound = breakpointsPerCol.length - 2 because it's the last valid segment + int[] breakpoints = breakpointsPerCol[colIdx]; + double[] slopes = slopesPerCol[colIdx]; + double[] intercepts = interceptsPerCol[colIdx]; + // binary search for the segment containing row r int lowerBound = 0; - int higherBound = bps.length - 2; + int higherBound = breakpoints.length - 2; while(lowerBound <= higherBound) { int mid = (lowerBound + higherBound) / 2; - if(r < bps[mid + 1]) { + if(r < breakpoints[mid + 1]) { higherBound = mid - 1; } else lowerBound = mid + 1; } - int segment = Math.min(lowerBound, bps.length - 2); - return slps[segment] * (double) r + ints[segment]; + int segment = Math.min(lowerBound, breakpoints.length - 2); + return slopes[segment] * (double) r + intercepts[segment]; } + /** + * Returns a total number of stored values remaining all columns counting breakpoints, slopes and intercepts per + * column + * + * @return total number of stored compression values + */ @Override public int getNumValues() { - return breakpointsPerCol.length + slopesPerCol.length + interceptsPerCol.length; + int total = 0; + for(int c = 0; c < _colIndexes.size(); c++) { + total += breakpointsPerCol[c].length + slopesPerCol[c].length + interceptsPerCol[c].length; + } + return total; } + /** + * Returns the exact size on disk in bytes includes per column arrays for breakpoints, slopes, intercepts + * + * @return size in bytes + */ @Override public long getExactSizeOnDisk() { 
long ret = super.getExactSizeOnDisk(); int numCols = _colIndexes.size(); - ret += 8L * numCols * 3; - ret += 24L * 3; + ret += 8L * numCols * 3; //array reference pointers + ret += 24L * 3; // outer array headers + ret += 4L; //numRows field for(int c = 0; c < numCols; c++) { ret += (long) MemoryEstimates.intArrayCost(breakpointsPerCol[c].length); @@ -171,29 +221,47 @@ public long getExactSizeOnDisk() { ret += (long) MemoryEstimates.doubleArrayCost(interceptsPerCol[c].length); } - ret += 4L; return ret; } + /** + * Computes the column sums of the decompressed matrix using sum of arithmetic series Where sumX = len * (2*start + + * len - 1) / 2 + * + * @param c output array to accumulate column sums into + * @param nRows number of rows, which is used because it is covered by the breakpoints + */ @Override public void computeSum(double[] c, int nRows) { for(int col = 0; col < _colIndexes.size(); col++) { - double colSum = 0.0; + double sum = 0.0; int[] breakpoints = breakpointsPerCol[col]; double[] intercepts = interceptsPerCol[col]; double[] slopes = slopesPerCol[col]; - for(int seg = 0; seg < breakpoints.length - 1; seg++) { - int start = breakpoints[seg], end = breakpoints[seg + 1]; + + for(int seg = 0; seg < slopes.length; seg++) { + int start = breakpoints[seg]; + int end = breakpoints[seg + 1]; int len = end - start; - double b = intercepts[seg], m = slopes[seg]; - double sumR = (double) len * (len - 1) / 2.0; - colSum += (double) len * b + m * sumR; + if(len <= 0) + continue; + + double sumX = (double) len * (2.0 * start + (len - 1)) / 2.0; + sum += slopes[seg] * sumX + intercepts[seg] * len; } - c[col] += colSum; + c[col] += sum; } } + /** + * Computes column sums by delegating to computeSum Methods are identical because every ColGroup just knows its own + * column + * + * @param c The array to add the column sum to. + * @param nRows The number of rows in the column group. 
+ */ + @Override public void computeColSums(double[] c, int nRows) { computeSum(c, nRows); @@ -209,106 +277,127 @@ protected ColGroupType getColGroupType() { return ColGroupType.PiecewiseLinear; } + /** + * Applies a scalar operation to all segments of this column group For plus/minus operation are only the intercepts + * modified For Multiply/Divide slopes and intercepts are scaled + * + * @param op operation to perform + * @return a new ColGroupPiecewiseLinearCompressed with updated coefficients + * @throws NotImplementedException if the operator is not plus, minus, multiply or divide + */ @Override public AColGroup scalarOperation(ScalarOperator op) { final int numCols = _colIndexes.size(); + + if(!(op.fn instanceof Plus || op.fn instanceof Minus || op.fn instanceof Multiply || op.fn instanceof Divide)) { + throw new NotImplementedException("Unsupported scalar op: " + op.fn.getClass().getSimpleName()); + } + double[][] newIntercepts = new double[numCols][]; double[][] newSlopes = new double[numCols][]; - if(op.fn instanceof Plus || op.fn instanceof Minus) { - for(int col = 0; col < numCols; col++) { - int numSegments = interceptsPerCol[col].length; - newIntercepts[col] = new double[numSegments]; - newSlopes[col] = slopesPerCol[col].clone(); // Unverändert - for(int seg = 0; seg < numSegments; seg++) - newIntercepts[col][seg] = op.executeScalar(interceptsPerCol[col][seg]); - } // shift intercept - } - else if(op.fn instanceof Multiply || op.fn instanceof Divide) { - for(int col = 0; col < numCols; col++) { - int numSegments = interceptsPerCol[col].length; - newIntercepts[col] = new double[numSegments]; - newSlopes[col] = new double[numSegments]; - for(int seg = 0; seg < numSegments; seg++) { + + for(int col = 0; col < numCols; col++) { + final int numSegments = interceptsPerCol[col].length; + newIntercepts[col] = new double[numSegments]; + newSlopes[col] = new double[numSegments]; + + for(int seg = 0; seg < numSegments; seg++) { + if(op.fn instanceof Plus || 
op.fn instanceof Minus) { + // only intercepts changes + newSlopes[col][seg] = slopesPerCol[col][seg]; newIntercepts[col][seg] = op.executeScalar(interceptsPerCol[col][seg]); + } + else { // Multiply/Divide newSlopes[col][seg] = op.executeScalar(slopesPerCol[col][seg]); + newIntercepts[col][seg] = op.executeScalar(interceptsPerCol[col][seg]); } - }//shift slope and intercept - } - else { - throw new NotImplementedException("Unsupported scalar op"); + } } - // new ColGroup because of changed slopes, intercepts + return new ColGroupPiecewiseLinearCompressed(_colIndexes, breakpointsPerCol, newSlopes, newIntercepts, numRows); } + /** + * Applies a row vector operation from the left For plus/minus are the intercepts shifted For multiply/divide slopes + * and intercepts are scaled + * + * @param op The operation to execute + * @param v The vector of values to apply the values contained should be at least the length of the highest + * value in the column index + * @param isRowSafe True if the binary op is applied to an entire zero row and all results are zero + * @return a new ColGroupPiecewiseLinearCompressed with updated coefficients + */ + @Override public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) { final int numCols = _colIndexes.size(); double[][] newIntercepts = new double[numCols][]; double[][] newSlopes = new double[numCols][]; - if(op.fn instanceof Plus || op.fn instanceof Minus) { - for(int col = 0; col < numCols; col++) { - double rowValue = v[_colIndexes.get(col)]; - int numSeg = interceptsPerCol[col].length; - newIntercepts[col] = new double[numSeg]; - newSlopes[col] = slopesPerCol[col].clone(); - for(int seg = 0; seg < numSeg; seg++) { - newIntercepts[col][seg] = op.fn.execute(rowValue, interceptsPerCol[col][seg]); - } - } - } - else if(op.fn instanceof Multiply || op.fn instanceof Divide) { - for(int col = 0; col < numCols; col++) { - double rowValue = v[_colIndexes.get(col)]; - int numSeg = interceptsPerCol[col].length; - 
newIntercepts[col] = new double[numSeg]; - newSlopes[col] = new double[numSeg]; - for(int seg = 0; seg < numSeg; seg++) { - newIntercepts[col][seg] = op.fn.execute(rowValue, interceptsPerCol[col][seg]); + final boolean isAddSub = op.fn instanceof Plus || op.fn instanceof Minus; + + if(!isAddSub && !(op.fn instanceof Multiply || op.fn instanceof Divide)) + throw new NotImplementedException("Unsupported binary op: " + op.fn.getClass().getSimpleName()); + + for(int col = 0; col < numCols; col++) { + double rowValue = v[_colIndexes.get(col)]; + int numSegs = interceptsPerCol[col].length; + newIntercepts[col] = new double[numSegs]; + + // Plus/Minus: slope is translation-invariant, only intercept shifts + newSlopes[col] = isAddSub ? slopesPerCol[col].clone() : new double[numSegs]; + + for(int seg = 0; seg < numSegs; seg++) { + newIntercepts[col][seg] = op.fn.execute(rowValue, interceptsPerCol[col][seg]); + if(!isAddSub) newSlopes[col][seg] = op.fn.execute(rowValue, slopesPerCol[col][seg]); - } } } - else { - throw new NotImplementedException("Unsupported binary op"); - } return new ColGroupPiecewiseLinearCompressed(_colIndexes, breakpointsPerCol, newSlopes, newIntercepts, numRows); } + /** + * Applies a row vector operation from the right For plus/minus are the intercepts shifted For multiply/divide + * slopes and intercepts are scaled + * + * @param op The operation to execute + * @param v The vector of values to apply the values contained should be at least the length of the highest + * value in the column index + * @param isRowSafe True if the binary op is applied to an entire zero row and all results are zero + * @return a new ColGroupPiecewiseLinearCompressed with updated coefficients + */ @Override public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) { final int numCols = _colIndexes.size(); - double[][] newIntercepts = new double[numCols][]; + final boolean isAddSub = op.fn instanceof Plus || op.fn instanceof Minus; + + if(!isAddSub 
&& !(op.fn instanceof Multiply || op.fn instanceof Divide)) + throw new NotImplementedException("Unsupported scalar op: " + op.fn.getClass().getSimpleName()); + double[][] newSlopes = new double[numCols][]; - if(op.fn instanceof Plus || op.fn instanceof Minus) { - for(int col = 0; col < numCols; col++) { - double rowValue = v[_colIndexes.get(col)]; - int numSeg = interceptsPerCol[col].length; - newIntercepts[col] = new double[numSeg]; - newSlopes[col] = slopesPerCol[col].clone(); - for(int seg = 0; seg < numSeg; seg++) { - newIntercepts[col][seg] = op.fn.execute(interceptsPerCol[col][seg], rowValue); - } - } - } - else if(op.fn instanceof Multiply || op.fn instanceof Divide) { - for(int col = 0; col < numCols; col++) { - double rowValue = v[_colIndexes.get(col)]; - int numSeg = interceptsPerCol[col].length; - newIntercepts[col] = new double[numSeg]; - newSlopes[col] = new double[numSeg]; - for(int seg = 0; seg < numSeg; seg++) { - newIntercepts[col][seg] = op.fn.execute(interceptsPerCol[col][seg], rowValue); - newSlopes[col][seg] = op.fn.execute(slopesPerCol[col][seg], rowValue); - } + double[][] newIntercepts = new double[numCols][]; + + for(int col = 0; col < numCols; col++) { + double val = v[_colIndexes.get(col)]; + int numSegs = interceptsPerCol[col].length; + // Plus/Minus shifts intercept only, slopes are unchanged + newSlopes[col] = isAddSub ? 
slopesPerCol[col].clone() : new double[numSegs]; + newIntercepts[col] = new double[numSegs]; + + for(int seg = 0; seg < numSegs; seg++) { + newIntercepts[col][seg] = op.fn.execute(interceptsPerCol[col][seg], val); + if(!isAddSub) + newSlopes[col][seg] = op.fn.execute(slopesPerCol[col][seg], val); } } - else { - throw new NotImplementedException("Unsupported binary op"); - } return new ColGroupPiecewiseLinearCompressed(_colIndexes, breakpointsPerCol, newSlopes, newIntercepts, numRows); } + /** + * Returns true if any decompressed value in this column group equals the given pattern + * + * @param pattern The value to look for. + * @return true if pattern is found, else false + */ @Override public boolean containsValue(double pattern) { for(int col = 0; col < _colIndexes.size(); col++) { @@ -318,37 +407,39 @@ public boolean containsValue(double pattern) { return false; } + /** + * checks if any reconstructed value in column col equals the pattern for each segment, solves the m * x + b = + * pattern instead of scanning all rows + * + * @param col column index + * @param pattern the value to search for + * @return true if the pattern is found + */ + private boolean colContainsValue(int col, double pattern) { int[] breakpoints = breakpointsPerCol[col]; double[] intercepts = interceptsPerCol[col]; double[] slopes = slopesPerCol[col]; - int numSeg = breakpoints.length - 1; - - for(int seg = 0; seg < numSeg; seg++) { + for(int seg = 0; seg < breakpoints.length - 1; seg++) { int start = breakpoints[seg]; - int end = breakpoints[seg + 1]; - int len = end - start; + int len = breakpoints[seg + 1] - start; if(len <= 0) continue; - double yIntercept = intercepts[seg]; - double slope = slopes[seg]; + double b = intercepts[seg]; + double m = slopes[seg]; - if(slope == 0.0) { - if(Double.compare(yIntercept, pattern) == 0) + if(m == 0.0) { + // constant segment: all values equal b + if(Double.compare(b, pattern) == 0) return true; continue; } - if(Double.compare(yIntercept, pattern) 
== 0) - return true; - - double endVal = yIntercept + slope * (len - 1); - if(Double.compare(endVal, pattern) == 0) - return true; - - double rowIndex = (pattern - yIntercept) / slope; - if(rowIndex > 0 && rowIndex < (len - 1) && Double.compare(yIntercept + slope * rowIndex, pattern) == 0) + // check if pattern lies on the line: solve m*x + b = pattern for x + double x = (pattern - b) / m; + int xi = (int) x; + if(xi >= start && xi < start + len && Double.compare(m * xi + b, pattern) == 0) return true; } return false; From 739e2374aedc8730c54497dbe66927a082a70888 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Thu, 12 Mar 2026 23:10:48 +0100 Subject: [PATCH 33/35] fix: refactoring --- .../compress/colgroup/ColGroupFactory.java | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java index dfec14d2704..76b6d04ecf2 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java @@ -308,8 +308,8 @@ else if(ct == CompressionType.LinearFunctional) { else if(ct == CompressionType.PiecewiseLinear) { return compressPiecewiseLinearFunctional(colIndexes, in, cs); } - else if(ct == CompressionType.PiecewiseLinearSukzessive) { - return compressPiecewiseLinearFunctionalSukzessive(colIndexes, in, cs); + else if(ct == CompressionType.PiecewiseLinearSuccessive) { + return compressPiecewiseLinearFunctionalSuccessive(colIndexes, in, cs); } else if(ct == CompressionType.DDCFOR) { AColGroup g = directCompressDDC(colIndexes, cg); @@ -1075,6 +1075,17 @@ private static AColGroup compressLinearFunctional(IColIndex colIndexes, MatrixBl return ColGroupLinearFunctional.create(colIndexes, coefficients, numRows); } + /** + * This method is the entry point to 
compress a matrix with piecewise linear compression The first method uses a + * segmented least squares with dynamic programming to compress the columns The second method uses a successive + * compression method, which compares each values in linear time and checks if the targetloss exceeded + * + * @param colIndexes the column indices to compress + * @param in the input Matrixblock containing the data + * @param cs compression settings to define the target loss, which should be considered + * @return a piecewise linear compressed column group + */ + public static AColGroup compressPiecewiseLinearFunctional(IColIndex colIndexes, MatrixBlock in, CompressionSettings cs) { @@ -1099,7 +1110,7 @@ public static AColGroup compressPiecewiseLinearFunctional(IColIndex colIndexes, } - public static AColGroup compressPiecewiseLinearFunctionalSukzessive(IColIndex colIndexes, MatrixBlock in, + public static AColGroup compressPiecewiseLinearFunctionalSuccessive(IColIndex colIndexes, MatrixBlock in, CompressionSettings cs) { final int numRows = in.getNumRows(); final int numCols = colIndexes.size(); @@ -1110,8 +1121,8 @@ public static AColGroup compressPiecewiseLinearFunctionalSukzessive(IColIndex co for(int col = 0; col < numCols; col++) { final int colIdx = colIndexes.get(col); double[] column = PiecewiseLinearUtils.getColumn(in, colIdx); - PiecewiseLinearUtils.SegmentedRegression fit = PiecewiseLinearUtils.compressSukzessivePiecewiseLinear(column, - cs); + PiecewiseLinearUtils.SegmentedRegression fit = PiecewiseLinearUtils.compressSuccessivePiecewiseLinear( + column, cs); breakpointsPerCol[col] = fit.getBreakpoints(); interceptsPerCol[col] = fit.getIntercepts(); slopesPerCol[col] = fit.getSlopes(); From 4bd8e0be5e8cfc1b420f666f4f369bf8dda09a3e Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Thu, 12 Mar 2026 23:10:57 +0100 Subject: [PATCH 34/35] fix: refactoring --- .../org/apache/sysds/runtime/compress/colgroup/AColGroup.java | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java index 07382ed932b..995837a6ad8 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java @@ -66,7 +66,7 @@ public abstract class AColGroup implements Serializable { /** Public super types of compression ColGroups supported */ public static enum CompressionType { UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCFOR, DDCFOR, DeltaDDC, LinearFunctional, PiecewiseLinear, - PiecewiseLinearSukzessive; + PiecewiseLinearSuccessive; public boolean isDense() { return this == DDC || this == CONST || this == DDCFOR || this == DDCFOR; From affa40ab83b8e642919182ed4e9aa3368e14b22b Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Sat, 14 Mar 2026 00:01:09 +0100 Subject: [PATCH 35/35] performance test --- ...ewiseLinearCompressionPerformanceTest.java | 168 ++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 src/test/java/org/apache/sysds/performance/PiecewiseLinearCompressionPerformanceTest.java diff --git a/src/test/java/org/apache/sysds/performance/PiecewiseLinearCompressionPerformanceTest.java b/src/test/java/org/apache/sysds/performance/PiecewiseLinearCompressionPerformanceTest.java new file mode 100644 index 00000000000..6046bdfb20b --- /dev/null +++ b/src/test/java/org/apache/sysds/performance/PiecewiseLinearCompressionPerformanceTest.java @@ -0,0 +1,168 @@ +package org.apache.sysds.performance; + +import org.apache.sysds.runtime.compress.CompressionSettings; +import org.apache.sysds.runtime.compress.CompressionSettingsBuilder; +import org.apache.sysds.runtime.compress.colgroup.AColGroup; +import org.apache.sysds.runtime.compress.colgroup.ColGroupFactory; +import 
org.apache.sysds.runtime.compress.colgroup.ColGroupPiecewiseLinearCompressed;
+import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
+import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.utils.stats.Timing;
+import java.util.Random;
+
+/**
+ * Performance benchmark for piecewise linear compression.
+ * <p>
+ * The successive variant is benchmarked across large matrices to show scalability.
+ * The DP variant is only used as a quality reference on a small matrix because of its
+ * quadratic complexity.
+ */
+public class PiecewiseLinearCompressionPerformanceTest {
+
+	/** Target losses passed to the compression settings: loose, average, strict. */
+	private static final double[] LOSSES = {1e-1, 1e-2, 1e-4};
+
+	/** Number of timed compression runs that are averaged per measurement. */
+	private static final int REPS = 3;
+
+	/**
+	 * Generates a synthetic time-series matrix to provide a realistic test setup.
+	 * Each column is a linear trend plus a level, with occasional level/trend shifts and
+	 * autocorrelated Gaussian noise; values are clamped to be non-negative.
+	 * The random generator uses a fixed seed, so repeated calls with the same dimensions
+	 * produce the same data.
+	 *
+	 * @param nr number of rows
+	 * @param nc number of columns
+	 * @return matrix filled with the generated data
+	 */
+	private static MatrixBlock generateTestMatrix(int nr, int nc) {
+		// NOTE(review): the third constructor argument is true here but false in
+		// reconstructionMSE, and a dense block is allocated right away -- confirm intent.
+		MatrixBlock mb = new MatrixBlock(nr, nc, true);
+		mb.allocateDenseBlock();
+		Random rng = new Random(42);
+		for(int c = 0; c < nc; c++) {
+			double trend = 0.001 * c;
+			double level = rng.nextDouble() * 5.0;
+			double volatility = 0.1 + 0.01 * c;
+			double residual = 0.0;
+
+			for(int row = 0; row < nr; row++) {
+				// Irregular level shift: a fresh modulus in [75, 150) is drawn for every row
+				// and a shift fires when the row index is divisible by it (always at row 0).
+				// This is not a fixed 75-150 row period; the draw is kept as is so that the
+				// seeded data set stays unchanged.
+				if(row % (75 + (int) (75 * rng.nextDouble())) == 0) {
+					level += (rng.nextDouble() - 0.5) * 2.0;
+					trend += (rng.nextDouble() - 0.5) * 0.0005;
+				}
+				// AR(1) noise: residual = 0.7 * previous residual + Gaussian * volatility
+				residual = 0.7 * residual + rng.nextGaussian() * volatility;
+				// clamp at zero so the series never becomes negative
+				mb.set(row, c, Math.max(0, trend * row + level + residual));
+			}
+		}
+		return mb;
+	}
+
+	/**
+	 * Returns the average number of segments per column.
+	 *
+	 * @param cg column group; must be a {@link ColGroupPiecewiseLinearCompressed}, otherwise
+	 *           the cast below throws a {@link ClassCastException}
+	 * @return mean segment count over all breakpoint arrays of the group
+	 */
+	private static double avgSegments(AColGroup cg) {
+		int[][] breakpoints = ((ColGroupPiecewiseLinearCompressed) cg).getBreakpointsPerCol();
+		int total = 0;
+		for(int[] bp : breakpoints) {
+			// k breakpoints delimit k - 1 segments
+			total += bp.length - 1;
+		}
+		return total / (double) breakpoints.length;
+	}
+
+	/**
+	 * Computes the mean squared error between the original matrix and the decompressed
+	 * output of the given column group.
+	 *
+	 * @param orig original matrix
+	 * @param cg   piecewise linear compressed column group
+	 * @return mean of the squared cell-wise differences (NaN for an empty matrix)
+	 */
+	private static double reconstructionMSE(MatrixBlock orig, AColGroup cg) {
+		int nr = orig.getNumRows(), nc = orig.getNumColumns();
+		MatrixBlock recon = new MatrixBlock(nr, nc, false);
+		recon.allocateDenseBlock();
+		// presumably decompresses rows [0, nr) with zero row/column offsets -- TODO confirm
+		cg.decompressToDenseBlock(recon.getDenseBlock(), 0, nr, 0, 0);
+		double sse = 0;
+		for(int r = 0; r < nr; r++) {
+			for(int c = 0; c < nc; c++) {
+				double diff = orig.get(r, c) - recon.get(r, c);
+				sse += diff * diff;
+			}
+		}
+		// multiply in double: nr * nc as int overflows silently for large matrices
+		return sse / ((double) nr * nc);
+	}
+
+	/**
+	 * Benchmarks successive compression for a given matrix and target loss.
+	 * Prints the average segment count, compressed size, average runtime over
+	 * {@link #REPS} runs and the reconstruction MSE.
+	 *
+	 * @param mb   original matrix to compress
+	 * @param loss target loss parameter
+	 */
+	private static void benchmarkSuccessive(MatrixBlock mb, double loss) {
+		long origSize = mb.getInMemorySize();
+		int numCol = mb.getNumColumns();
+		CompressionSettings cs = new CompressionSettingsBuilder().create();
+		cs.setPiecewiseTargetLoss(loss);
+		IColIndex colIndexes = ColIndexFactory.create(numCol);
+
+		// untimed run before the timer starts (presumably a JIT warm-up); result is discarded
+		ColGroupFactory.compressPiecewiseLinearFunctionalSuccessive(colIndexes, mb, cs);
+
+		Timing t = new Timing();
+		AColGroup cg = null;
+		t.start();
+		for(int i = 0; i < REPS; i++) {
+			cg = ColGroupFactory.compressPiecewiseLinearFunctionalSuccessive(colIndexes, mb, cs);
+		}
+		// average over the repetitions; printed below with the unit "ms"
+		double time = t.stop() / REPS;
+
+		long size = cg.getExactSizeOnDisk();
+		String saving = size < origSize
+			? String.format("saved %3.0f%%", 100.0 - 100.0 * size / origSize)
+			: String.format("larger +%.0f%%", 100.0 * size / origSize - 100);
+
+		System.out.printf(" successive loss=%.0e %5.1f segs %6.2f MB (%s) %6.1f ms MSE=%.2e%n",
+			loss, avgSegments(cg), size / 1e6, saving, time, reconstructionMSE(mb, cg));
+	}
+
+	/**
+	 * Benchmarks dynamic programming compression for a given matrix and target loss.
+	 * Runs once without repetitions because DP is too slow to repeat.
+	 * Prints the average segment count, compressed size, runtime and the reconstruction MSE.
+	 *
+	 * @param mb   original matrix to compress
+	 * @param loss target loss parameter
+	 */
+	private static void benchmarkDP(MatrixBlock mb, double loss) {
+		long origSize = mb.getInMemorySize();
+		int numColumns = mb.getNumColumns();
+		CompressionSettings cs = new CompressionSettingsBuilder().create();
+		cs.setPiecewiseTargetLoss(loss);
+		IColIndex colIndexes = ColIndexFactory.create(numColumns);
+
+		Timing t = new Timing();
+		t.start();
+		AColGroup cg = ColGroupFactory.compressPiecewiseLinearFunctional(colIndexes, mb, cs);
+		double time = t.stop();
+
+		long size = cg.getExactSizeOnDisk();
+		String saving = size < origSize
+			? String.format("saved %3.0f%%", 100.0 - 100.0 * size / origSize)
+			: String.format("LARGER +%.0f%%", 100.0 * size / origSize - 100);
+
+		System.out.printf(" DP loss=%.0e %5.1f segs %6.2f MB (%s) %6.1f ms MSE=%.2e%n",
+			loss, avgSegments(cg), size / 1e6, saving, time, reconstructionMSE(mb, cg));
+	}
+
+	/**
+	 * Runs the benchmark: the successive variant over a grid of matrix sizes and all
+	 * target losses, then the DP variant once per target loss on a 1000 x 10 matrix.
+	 *
+	 * @param args unused
+	 */
+	public static void main(String[] args) {
+		System.out.println("=== Piecewise Linear Compression Benchmark ===\n");
+
+		// Successive scalability across large matrices; each entry is {rows, columns}
+		System.out.println("=== Successive: scalability ===");
+		int[][] configs = {{1000, 10}, {1000, 100}, {1000, 500},
+			{5000, 10}, {5000, 100}, {5000, 500},
+			{10000, 10}, {10000, 100}, {10000, 500}};
+
+		for(int[] cfg : configs) {
+			int nr = cfg[0], nc = cfg[1];
+			MatrixBlock mb = generateTestMatrix(nr, nc);
+			System.out.printf("%nnrows=%d ncols=%d original=%.2f MB%n",
+				nr, nc, mb.getInMemorySize() / 1e6);
+			for(double loss : LOSSES) {
+				benchmarkSuccessive(mb, loss);
+			}
+		}
+
+		// DP quality reference on small matrix
+		System.out.println("\n=== DP: quality reference (nrows=1000, ncols=10) ===");
+		MatrixBlock mbSmall = generateTestMatrix(1000, 10);
+		System.out.printf("original=%.2f MB%n", mbSmall.getInMemorySize() / 1e6);
+		for(double loss : LOSSES) {
+			benchmarkDP(mbSmall, loss);
+		}
+	}
+}