diff --git a/.gitignore b/.gitignore
index d2fcdb9a4de..4cc5f69557c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,9 +7,9 @@
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -127,7 +127,7 @@ derby.log
src/main/cpp/build
src/main/cpp/bin
-# legacy dml
+# legacy dml
*.dmlt
# Performance Test artifacts
@@ -157,3 +157,16 @@ docker/mountFolder/*.bin.mtd
SEAL-*/
+data/lineorder.tbl
+data/test/lineorder.tbl
+# SSB Data and subfolders
+/data/
+shell/ssbOutputData/
+/sql/ssb.duckdb
+# SSB data, outputs, and local DB file
+/data/
+/scripts/ssb/shell/ssbOutputData/
+/scripts/ssb/sql/ssb.duckdb
+
+# Auto-generated single-thread config for SSB runs
+/conf/single_thread.xml
diff --git a/conf/single_thread.xml b/conf/single_thread.xml
new file mode 100644
index 00000000000..dba1ac6b805
--- /dev/null
+++ b/conf/single_thread.xml
@@ -0,0 +1,8 @@
+
+
+ sysds.cp.parallel.opsfalse
+
+
+ sysds.num.threads1
+
+
diff --git a/scripts/ssb/README.md b/scripts/ssb/README.md
new file mode 100644
index 00000000000..397350807e5
--- /dev/null
+++ b/scripts/ssb/README.md
@@ -0,0 +1,505 @@
+# Star Schema Benchmark (SSB) for SystemDS
+
+This README documents the SSB DML queries under `scripts/ssb/queries/` and the runner scripts under `scripts/ssb/shell/` that execute and benchmark them. It is focused on what is implemented today, how to run it, and how to interpret the outputs for performance analysis.
+
+---
+
+## Table of Contents
+
+1. Project Layout
+2. Quick Start
+3. Data Location (`--input-dir` and DML `input_dir`)
+4. Single-Engine Runner (`scripts/ssb/shell/run_ssb.sh`)
+5. Multi-Engine Performance Runner (`scripts/ssb/shell/run_all_perf.sh`)
+6. Outputs and Examples
+7. Adding/Editing Queries
+8. Troubleshooting
+
+---
+
+## 1) Project Layout
+
+Paths are relative to the repo root:
+
+```
+systemds/
+├── scripts/ssb/
+│ ├── README.md # This guide
+│ ├── queries/ # DML queries (q1_1.dml ... q4_3.dml)
+│ │ ├── q1_1.dml - q1_3.dml # Flight 1
+│ │ ├── q2_1.dml - q2_3.dml # Flight 2
+│ │ ├── q3_1.dml - q3_4.dml # Flight 3
+│ │ └── q4_1.dml - q4_3.dml # Flight 4
+│ ├── shell/
+│ │ ├── run_ssb.sh # Single-engine (SystemDS) runner
+│ │ ├── run_all_perf.sh # Multi-engine performance benchmark
+│ │ └── ssbOutputData/ # Results (created on first run)
+│ │ ├── QueryData/ # Per-query outputs from run_ssb.sh
+│ │ └── PerformanceData/ # Multi-engine outputs from run_all_perf.sh
+│ └── sql/ # SQL versions + `ssb.duckdb` for DuckDB
+```
+
+Note: The SSB raw data directory is not committed. You must point the runners to your generated data with `--input-dir`.
+
+---
+
+## 2) Quick Start
+
+Set up SystemDS and run the SSB queries.
+
+1) Build SystemDS (from repo root):
+
+```bash
+mvn -DskipTests package
+```
+
+2) Make sure the SystemDS binary exists (repo-local `bin/systemds` or on `PATH`).
+
+3) Make runner scripts executable:
+
+```bash
+chmod +x scripts/ssb/shell/run_ssb.sh scripts/ssb/shell/run_all_perf.sh
+```
+
+4) Provide SSB data (from dbgen) in a directory, e.g. `/path/to/ssb-data`.
+
+5) Run a single SSB query on SystemDS (from repo root):
+
+```bash
+scripts/ssb/shell/run_ssb.sh q1.1 --input-dir=/path/to/ssb-data --stats
+```
+
+6) Run the multi-engine performance benchmark across all queries (from repo root):
+
+```bash
+scripts/ssb/shell/run_all_perf.sh --input-dir=/path/to/ssb-data --stats --repeats=5
+```
+
+If `--input-dir` is omitted, the scripts default to `./data/` under the repo root.
+
+---
+
+## 3) Data Location (`--input-dir` and DML `input_dir`)
+
+Both runners pass a named argument `input_dir` into DML as:
+
+```
+-nvargs input_dir=/absolute/path/to/ssb-data
+```
+
+Your DML scripts should construct paths from `input_dir`. Example:
+
+```dml
+dates = read(paste(input_dir, "/date.tbl", sep=""), data_type="frame", format="csv", sep="|", header=FALSE)
+lineorder = read(paste(input_dir, "/lineorder.tbl", sep=""), data_type="frame", format="csv", sep="|", header=FALSE)
+```
+
+Expected base files in `input_dir`: `customer.tbl`, `supplier.tbl`, `part.tbl`, `date.tbl` and `lineorder*.tbl` (fact table name can vary by scale). The runners validate that `--input-dir` exists before executing.
+
+---
+
+## 4) Single-Engine Runner (`scripts/ssb/shell/run_ssb.sh`)
+
+Runs SSB DML queries with SystemDS and saves results per query.
+
+- Usage:
+ - `scripts/ssb/shell/run_ssb.sh` — run all SSB queries
+ - `scripts/ssb/shell/run_ssb.sh q1.1 q2.3` — run specific queries
+ - `scripts/ssb/shell/run_ssb.sh --stats` — include SystemDS internal statistics
+ - `scripts/ssb/shell/run_ssb.sh --input-dir=/path/to/data` — set data dir
+ - `scripts/ssb/shell/run_ssb.sh --output-dir=/tmp/out` — set output dir
+
+- Query names: You can use dotted form (`q1.1`); the runner maps to `q1_1.dml` internally.
+
+- Functionality:
+ - Single-threaded execution via auto-generated `conf/single_thread.xml`.
+ - DML `input_dir` forwarding with `-nvargs`.
+ - Pre-check for data directory; clear errors if missing.
+ - Runtime error detection by scanning for “An Error Occurred : …”.
+ - Optional `--stats` to capture SystemDS internal statistics in JSON.
+ - Per-query outputs in TXT, CSV, and JSON.
+ - `run.json` with run-level metadata and per-query status/results.
+ - Clear end-of-run summary and, for table results, a “DETAILED QUERY RESULTS” section.
+ - Exit code is non-zero if any query failed (handy for CI).
+
+- Output layout:
+ - Base directory: `--output-dir` (default: `scripts/ssb/shell/ssbOutputData/QueryData`)
+ - Each run: `ssb_run_/`
+ - `txt/.txt` — human-readable result
+ - `csv/.csv` — scalar or table as CSV
+ - `json/.json` — per-query JSON
+ - `run.json` — full metadata and results for the run
+
+- Example console output (abridged):
+
+```
+[1/13] Running: q1_1.dml
+...
+=========================================
+SSB benchmark completed!
+Total queries executed: 13
+Failed queries: 0
+Statistics: enabled
+
+=========================================
+RUN METADATA SUMMARY
+=========================================
+Timestamp: 2025-09-05 12:34:56 UTC
+Hostname: myhost
+Seed: 123456
+Software Versions:
+ SystemDS: 3.4.0-SNAPSHOT
+ JDK: 21.0.2
+System Resources:
+ CPU: Apple M2
+ RAM: 16GB
+Data Build Info:
+ SSB Data: customer:300000 part:200000 supplier:2000 lineorder:6001215
+=========================================
+
+===================================================
+QUERIES SUMMARY
+===================================================
+No. Query Result Status
+---------------------------------------------------
+1 q1.1 12 rows (see below) ✓ Success
+2 q1.2 1 ✓ Success
+...
+===================================================
+
+=========================================
+DETAILED QUERY RESULTS
+=========================================
+[1] Results for q1.1:
+----------------------------------------
+1992|ASIA|12345.67
+1993|ASIA|23456.78
+...
+----------------------------------------
+```
+
+---
+
+## 5) Multi-Engine Performance Runner (`scripts/ssb/shell/run_all_perf.sh`)
+
+Runs SSB queries across SystemDS, PostgreSQL, and DuckDB with repeated timings and statistical analysis.
+
+- Usage:
+ - `scripts/ssb/shell/run_all_perf.sh` — run all queries on available engines
+ - `scripts/ssb/shell/run_all_perf.sh q1.1 q2.3` — run specific queries
+ - `scripts/ssb/shell/run_all_perf.sh --warmup=2 --repeats=10` — control sampling
+ - `scripts/ssb/shell/run_all_perf.sh --stats` — include core/internal engine timings
+ - `scripts/ssb/shell/run_all_perf.sh --layout=wide|stacked` — control terminal layout
+ - `scripts/ssb/shell/run_all_perf.sh --input-dir=... --output-dir=...` — set paths
+
+- Query names: dotted form (`q1.1`) is accepted; mapped internally to `q1_1.dml`.
+
+- Engine prerequisites:
+ - PostgreSQL:
+ - Install `psql` CLI and ensure a PostgreSQL server is running.
+ - Default connection in the script: `POSTGRES_DB=ssb`, `POSTGRES_USER=$(whoami)`, `POSTGRES_HOST=localhost`.
+ - Create the `ssb` database and load the standard SSB tables and data (schema not included in this repo). The SQL queries under `scripts/ssb/sql/` expect the canonical SSB schema and data.
+ - The runner verifies connectivity; if it cannot connect or tables are missing, PostgreSQL results are skipped.
+ - DuckDB:
+ - Install the DuckDB CLI (`duckdb`).
+ - The runner looks for the database at `scripts/ssb/sql/ssb.duckdb`. Ensure it contains SSB tables and data.
+ - If the CLI is missing or the DB file cannot be opened, DuckDB results are skipped.
+ - SystemDS is required; the other engines are optional. Missing engines are reported and skipped gracefully.
+
+- Functionality:
+ - Single-threaded execution for fairness (SystemDS config; SQL engines via settings).
+ - Pre-flight data-dir check and SystemDS test-run with runtime-error detection.
+ - Warmups and repeated measurements using `/usr/bin/time -p` (ms resolution).
+ - Statistics per engine: mean, population stdev, p95, and CV%.
+ - “Shell” vs “Core” time: SystemDS core from `-stats`, PostgreSQL core via EXPLAIN ANALYZE, DuckDB core via JSON profiling.
+ - Environment verification: gracefully skips PostgreSQL or DuckDB if not available.
+ - Terminal-aware output: wide table with grid or stacked multi-line layout.
+ - Results to CSV and JSON with rich metadata (system info, versions, run config).
+
+- Layouts (display formats):
+ - Auto selection: `--layout=auto` (default). Chooses `wide` if terminal is wide enough, else `stacked`.
+ - Wide layout: `--layout=wide`. Prints a grid with columns for each engine and a `Fastest` column. Three header rows show labels for `mean`, `±/CV`, and `p95`.
+ - Stacked layout: `--layout=stacked` or `--stacked`. Prints a compact, multi-line block per query (best for narrow terminals).
+ - Dynamic scaling: The wide layout scales column widths to fit the terminal; if still too narrow, it falls back to stacked.
+ - Row semantics: Row 1 = mean (ms); Row 2 = `±stdev/CV%`; Row 3 = `p95 (ms)`.
+ - Fastest: The runner highlights the engine with the lowest mean per query.
+
+- Output layout:
+ - Base directory: `--output-dir` (default: `scripts/ssb/shell/ssbOutputData/PerformanceData`)
+ - Files per run (timestamped basename):
+ - `ssb_results_.csv`
+ - `ssb_results_.json`
+
+- Example console output (abridged, wide layout):
+
+```
+==================================================================================
+ MULTI-ENGINE PERFORMANCE BENCHMARK METADATA
+==================================================================================
+Timestamp: 2025-09-05 12:34:56 UTC
+Hostname: myhost
+Seed: 123456
+Software Versions:
+ SystemDS: 3.4.0-SNAPSHOT
+ JDK: 21.0.2
+ PostgreSQL: psql (PostgreSQL) 14.11
+ DuckDB: v0.10.3
+System Resources:
+ CPU: Apple M2
+ RAM: 16GB
+Data Build Info:
+ SSB Data: customer:300000 part:200000 supplier:2000 lineorder:6001215
+Run Configuration:
+ Statistics: enabled
+ Queries: 13 selected
+ Warmup Runs: 1
+ Repeat Runs: 5
+
++--------+--------------+--------------+--------------+----------------+--------------+----------------+----------+
+| Query | SysDS Shell | SysDS Core | PostgreSQL | PostgreSQL Core| DuckDB | DuckDB Core | Fastest |
+| | mean | mean | mean | mean | mean | mean | |
+| | ±/CV | ±/CV | ±/CV | ±/CV | ±/CV | ±/CV | |
+| | p95 | p95 | p95 | p95 | p95 | p95 | |
++--------+--------------+--------------+--------------+----------------+--------------+----------------+----------+
+| q1_1 | 1824.0 | 1210.0 | 2410.0 | 2250.0 | 980.0 | 910.0 | DuckDB |
+| | ±10.2/0.6% | ±8.6/0.7% | ±15.1/0.6% | ±14.0/0.6% | ±5.4/0.6% | ±5.0/0.5% | |
+| | p95:1840.0 | p95:1225.0 | p95:2435.0 | p95:2274.0 | p95:989.0 | p95:919.0 | |
++--------+--------------+--------------+--------------+----------------+--------------+----------------+----------+
+```
+
+- Example console output (abridged, stacked layout):
+
+```
+Query : q1_1 Fastest: DuckDB
+ SystemDS Shell: 1824.0
+ ±10.2ms/0.6%
+ p95:1840.0ms
+ SystemDS Core: 1210.0
+ ±8.6ms/0.7%
+ p95:1225.0ms
+ PostgreSQL: 2410.0
+ ±15.1ms/0.6%
+ p95:2435.0ms
+ PostgreSQL Core:2250.0
+ ±14.0ms/0.6%
+ p95:2274.0ms
+ DuckDB: 980.0
+ ±5.4ms/0.6%
+ p95:989.0ms
+ DuckDB Core: 910.0
+ ±5.0ms/0.5%
+ p95:919.0ms
+--------------------------------------------------------------------------------
+```
+
+---
+
+## 6) Outputs and Examples
+
+Where to find results and how to read them.
+
+- SystemDS-only runner (`scripts/ssb/shell/run_ssb.sh`):
+ - Directory: `scripts/ssb/shell/ssbOutputData/QueryData/ssb_run_/`
+ - Files: `txt/.txt`, `csv/.csv`, `json/.json`, and `run.json`
+ - `run.json` example (stats enabled, single query):
+
+```json
+{
+ "benchmark_type": "ssb_systemds",
+ "timestamp": "2025-09-07 19:45:11 UTC",
+ "hostname": "eduroam-141-23-175-117.wlan.tu-berlin.de",
+ "seed": 849958376,
+ "software_versions": {
+ "systemds": "3.4.0-SNAPSHOT",
+ "jdk": "17.0.15"
+ },
+ "system_resources": {
+ "cpu": "Apple M1 Pro",
+ "ram": "16GB"
+ },
+ "data_build_info": {
+ "customer": "30000",
+ "part": "200000",
+ "supplier": "2000",
+ "date": "2557",
+ "lineorder": "8217"
+ },
+ "run_configuration": {
+ "statistics_enabled": true,
+ "queries_selected": 1,
+ "queries_executed": 1,
+ "queries_failed": 0
+ },
+ "results": [
+ {
+ "query": "q1_1",
+ "result": "687752409 ",
+ "stats": [
+ "SystemDS Statistics:",
+ "Total elapsed time: 1.557 sec.",
+ "Total compilation time: 0.410 sec.",
+ "Total execution time: 1.147 sec.",
+ "Cache hits (Mem/Li/WB/FS/HDFS): 11054/0/0/0/2.",
+ "Cache writes (Li/WB/FS/HDFS): 0/26/3/0.",
+ "Cache times (ACQr/m, RLS, EXP): 0.166/0.001/0.060/0.000 sec.",
+ "HOP DAGs recompiled (PRED, SB): 0/175.",
+ "HOP DAGs recompile time: 0.063 sec.",
+ "Functions recompiled: 2.",
+ "Functions recompile time: 0.016 sec.",
+ "Total JIT compile time: 1.385 sec.",
+ "Total JVM GC count: 1.",
+ "Total JVM GC time: 0.026 sec.",
+ "Heavy hitter instructions:",
+ " # Instruction Time(s) Count",
+ " 1 m_raJoin 0.940 1",
+ " 2 ucumk+ 0.363 3",
+ " 3 - 0.219 1345",
+ " 4 nrow 0.166 7",
+ " 5 ctable 0.086 2",
+ " 6 * 0.078 1",
+ " 7 parallelBinarySearch 0.069 1",
+ " 8 ba+* 0.049 5",
+ " 9 rightIndex 0.016 8611",
+ " 10 leftIndex 0.015 1680"
+ ],
+ "status": "success"
+ }
+ ]
+}
+```
+
+ Notes:
+ - The `result` field contains the query’s output (scalar or tabular content collapsed). When `--stats` is used, `stats` contains the full SystemDS statistics block line-by-line.
+ - For failed queries, an `error_message` string is included and `status` is set to `"error"`.
+
+- Multi-engine runner (`scripts/ssb/shell/run_all_perf.sh`):
+ - Directory: `scripts/ssb/shell/ssbOutputData/PerformanceData/`
+ - Files per run: `ssb_results_.csv` and `.json`
+ - CSV contains display strings and raw numeric stats (mean/stdev/p95) for each engine; JSON contains the same plus metadata and fastest-engine per query.
+ - `ssb_results_*.json` example (stats enabled, single query):
+
+```json
+{
+ "benchmark_metadata": {
+ "benchmark_type": "multi_engine_performance",
+ "timestamp": "2025-09-07 20:11:16 UTC",
+ "hostname": "eduroam-141-23-175-117.wlan.tu-berlin.de",
+ "seed": 578860764,
+ "software_versions": {
+ "systemds": "3.4.0-SNAPSHOT",
+ "jdk": "17.0.15",
+ "postgresql": "psql (PostgreSQL) 17.5",
+ "duckdb": "v1.3.2 (Ossivalis) 0b83e5d2f6"
+ },
+ "system_resources": {
+ "cpu": "Apple M1 Pro",
+ "ram": "16GB"
+ },
+ "data_build_info": {
+ "customer": "30000",
+ "part": "200000",
+ "supplier": "2000",
+ "date": "2557",
+ "lineorder": "8217"
+ },
+ "run_configuration": {
+ "statistics_enabled": true,
+ "queries_selected": 1,
+ "warmup_runs": 1,
+ "repeat_runs": 5
+ }
+ },
+ "results": [
+ {
+ "query": "q1_1",
+ "systemds": {
+ "shell": {
+ "display": "2186.0 (±95.6ms/4.4%, p95:2250.0ms)",
+ "mean_ms": 2186.0,
+ "stdev_ms": 95.6,
+ "p95_ms": 2250.0
+ },
+ "core": {
+ "display": "1151.2 (±115.3ms/10.0%, p95:1334.0ms)",
+ "mean_ms": 1151.2,
+ "stdev_ms": 115.3,
+ "p95_ms": 1334.0
+ },
+ "status": "success",
+ "error_message": null
+ },
+ "postgresql": {
+ "display": "26.0 (±4.9ms/18.8%, p95:30.0ms)",
+ "mean_ms": 26.0,
+ "stdev_ms": 4.9,
+ "p95_ms": 30.0
+ },
+ "postgresql_core": {
+ "display": "3.8 (±1.4ms/36.8%, p95:5.7ms)",
+ "mean_ms": 3.8,
+ "stdev_ms": 1.4,
+ "p95_ms": 5.7
+ },
+ "duckdb": {
+ "display": "30.0 (±0.0ms/0.0%, p95:30.0ms)",
+ "mean_ms": 30.0,
+ "stdev_ms": 0.0,
+ "p95_ms": 30.0
+ },
+ "duckdb_core": {
+ "display": "1.1 (±0.1ms/9.1%, p95:1.3ms)",
+ "mean_ms": 1.1,
+ "stdev_ms": 0.1,
+ "p95_ms": 1.3
+ },
+ "fastest_engine": "PostgreSQL"
+ }
+ ]
+}
+```
+
+ Differences at a glance:
+ - Single-engine `run.json` focuses on query output (`result`) and, when enabled, the SystemDS `stats` array. Status and error handling are per-query.
+ - Multi-engine results JSON focuses on timing statistics for each engine (`shell` vs `core` for SystemDS; `postgresql`/`postgresql_core`; `duckdb`/`duckdb_core`) along with a `fastest_engine` field. It does not include the query’s actual result values.
+
+---
+
+## 7) Adding/Editing Queries
+
+Guidelines for DML in `scripts/ssb/queries/`:
+
+- Name files as `qX_Y.dml` (e.g., `q1_1.dml`). The runners accept `q1.1` on the CLI and map it for you.
+- Always derive paths from `input_dir` named argument (see Section 3).
+- Keep I/O separate from compute where possible (helps early error detection).
+- Add a short header comment with original SQL and intent.
+
+Example header:
+
+```dml
+/*
+ SQL: SELECT ...
+ Description: Revenue per month by supplier region
+*/
+```
+
+---
+
+## 8) Troubleshooting
+
+- Missing data directory: pass `--input-dir=/path/to/ssb-data` and ensure `*.tbl` files exist.
+- SystemDS not found: build (`mvn -DskipTests package`) and use `./bin/systemds` or ensure `systemds` is on PATH.
+- Query fails with runtime error: the runners mark `status: "error"` and include a short `error_message` in JSON outputs. See console snippet for context.
+- macOS cache dropping: OS caches cannot be dropped like Linux; the multi-engine runner mitigates with warmups + repeated averages and reports p95/CV.
+
+If something looks off, attach the relevant `run.json` or `ssb_results_*.json` when filing issues.
+
+- To debug DML runtime errors, run the DML directly:
+
+```bash
+./bin/systemds -f scripts/ssb/queries/q1_1.dml -nvargs input_dir=/path/to/data
+```
+
+- When `--stats` is enabled, SystemDS internal "core" timing is extracted and reported separately (useful to separate JVM / startup overhead from core computation).
+
+All these metrics appear in the generated CSVs and JSON entries.
+- Permission errors: `chmod +x scripts/ssb/shell/*.sh`.
diff --git a/scripts/ssb/queries/q1_1.dml b/scripts/ssb/queries/q1_1.dml
new file mode 100644
index 00000000000..295118ecd38
--- /dev/null
+++ b/scripts/ssb/queries/q1_1.dml
@@ -0,0 +1,70 @@
+/* DML-script implementing the ssb query Q1.1 in SystemDS.
+SELECT SUM(lo_extendedprice * lo_discount) AS REVENUE
+FROM lineorder, dates
+WHERE
+ lo_orderdate = d_datekey
+ AND d_year = 1993
+ AND lo_discount BETWEEN 1 AND 3
+ AND lo_quantity < 25;
+
+Usage:
+./bin/systemds scripts/ssb/queries/q1_1.dml -nvargs input_dir="/path/to/data"
+./bin/systemds scripts/ssb/queries/q1_1.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data"
+or with explicit -f flag:
+./bin/systemds -f scripts/ssb/queries/q1_1.dml -nvargs input_dir="/path/to/data"
+
+Parameters:
+input_dir - Path to input directory containing the table files (e.g., ./data)
+*/
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+
+# -- PARAMETER HANDLING --
+input_dir = ifdef($input_dir, "./data");
+print("Loading tables from directory: " + input_dir);
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+
+# -- PREPARING --
+# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : YEAR
+date_csv_min = cbind(date_csv[, 1], date_csv[, 5]);
+date_matrix_min = as.matrix(date_csv_min);
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-6 : LO_ORDERDATE |
+# COL-9 : LO_QUANTITY | COL-10 : LO_EXTPRICE | COL-12 : LO_DISCOUNT
+lineorder_csv_min = cbind(lineorder_csv[, 6], lineorder_csv[, 9], lineorder_csv[, 10], lineorder_csv[, 12]);
+lineorder_matrix_min = as.matrix(lineorder_csv_min);
+
+
+# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION --
+d_year_filt = raSel::m_raSelection(date_matrix_min, col=2, op="==", val=1993); # D_YEAR = '1993'
+
+# LO_QUANTITY < 25
+lo_quan_filt = raSel::m_raSelection(lineorder_matrix_min, col=2, op="<", val=25);
+
+# LO_DISCOUNT BETWEEN 1 AND 3
+lo_quan_disc_filt = raSel::m_raSelection(lo_quan_filt, col=4, op=">=", val=1);
+lo_quan_disc_filt = raSel::m_raSelection(lo_quan_disc_filt, col=4, op="<=", val=3);
+
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING FILTERED LINEORDER TABLE WITH FILTERED DATE TABLE WHERE LO_ORDERDATE = D_DATEKEY
+joined_matrix = raJoin::m_raJoin(A=lo_quan_disc_filt, colA=1, B=d_year_filt, colB=1, method="sort-merge");
+#print("LO-DATE JOINED.");
+
+
+# -- AGGREGATION --
+lo_extprice = joined_matrix[, 3]; #LO_EXTPRICE : 3 COLUMN OF JOINED-MATRIX
+lo_disc = joined_matrix[, 4]; #LO_DISCOUNT : 4 COLUMN OF JOINED-MATRIX
+revenue = sum(lo_extprice * lo_disc);
+
+print("REVENUE: " + as.integer(revenue));
+
+#print("Q1.1 finished.\n");
+
+
diff --git a/scripts/ssb/queries/q1_2.dml b/scripts/ssb/queries/q1_2.dml
new file mode 100644
index 00000000000..6f37d451e3e
--- /dev/null
+++ b/scripts/ssb/queries/q1_2.dml
@@ -0,0 +1,92 @@
+/*DML-script implementing the ssb query Q1.2 in SystemDS.
+SELECT SUM(lo_extendedprice * lo_discount) AS REVENUE
+FROM lineorder, dates
+WHERE
+ lo_orderdate = d_datekey
+ AND d_yearmonth = 'Jan1994'
+ AND lo_discount BETWEEN 4 AND 6
+ AND lo_quantity BETWEEN 26 AND 35;
+
+Usage:
+./bin/systemds scripts/ssb/queries/q1_2.dml -nvargs input_dir="/path/to/data"
+./bin/systemds scripts/ssb/queries/q1_2.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data"
+or with explicit -f flag:
+./bin/systemds -f scripts/ssb/queries/q1_2.dml -nvargs input_dir="/path/to/data"
+
+Parameters:
+input_dir - Path to input directory containing the table files (e.g., ./data)
+*/
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+
+# -- PARAMETER HANDLING --
+input_dir = ifdef($input_dir, "./data");
+print("Loading tables from directory: " + input_dir);
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+# -- PREPARING --
+# Optimized approach: Single-pass filtering with direct matrix construction
+# Convert date key column to numeric matrix for proper handling
+date_keys_matrix = as.matrix(date_csv[, 1]);
+
+# Count Jan1994 rows first to pre-allocate matrix efficiently
+date_nrows = nrow(date_csv);
+jan1994_count = 0;
+for (i in 1:date_nrows) {
+ yearmonth_val = as.scalar(date_csv[i, 7]);
+ if (yearmonth_val == "Jan1994") {
+ jan1994_count = jan1994_count + 1;
+ }
+}
+
+# Pre-allocate final matrix and fill in single pass
+date_filtered = matrix(0, jan1994_count, 2);
+filtered_idx = 0;
+for (i in 1:date_nrows) {
+ yearmonth_val = as.scalar(date_csv[i, 7]);
+ if (yearmonth_val == "Jan1994") {
+ filtered_idx = filtered_idx + 1;
+ date_filtered[filtered_idx, 1] = as.scalar(date_keys_matrix[i, 1]); # date_key
+ date_filtered[filtered_idx, 2] = 1; # encoded value for Jan1994
+ }
+}
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-6 : LO_ORDERDATE |
+# COL-9 : LO_QUANTITY | COL-10 : LO_EXTPRICE | COL-12 : LO_DISCOUNT
+lineorder_csv_min = cbind(lineorder_csv[, 6], lineorder_csv[, 9], lineorder_csv[, 10], lineorder_csv[, 12]);
+lineorder_min_matrix = as.matrix(lineorder_csv_min);
+
+
+# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION --
+# We already filtered for D_YEARMONTH = 'Jan1994', so d_year_filt is our filtered date data
+d_year_filt = date_filtered;
+
+# LO_QUANTITY BETWEEN 26 AND 35
+lo_quan_filt = raSel::m_raSelection(lineorder_min_matrix, col=2, op=">=", val=26);
+lo_quan_filt = raSel::m_raSelection(lo_quan_filt, col=2, op="<=", val=35);
+
+# LO_DISCOUNT BETWEEN 4 AND 6
+lo_quan_disc_filt = raSel::m_raSelection(lo_quan_filt, col=4, op=">=", val=4);
+lo_quan_disc_filt = raSel::m_raSelection(lo_quan_disc_filt, col=4, op="<=", val=6);
+
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING FILTERED LINEORDER TABLE WITH FILTERED DATE TABLE WHERE LO_ORDERDATE = D_DATEKEY
+joined_matrix = raJoin::m_raJoin(A=lo_quan_disc_filt, colA=1, B=d_year_filt, colB=1, method="sort-merge");
+#print("LO-DATE JOINED.");
+
+
+# -- AGGREGATION --
+lo_extprice = joined_matrix[, 3]; #LO_EXTPRICE : 3 COLUMN OF JOINED-MATRIX
+lo_disc = joined_matrix[, 4]; #LO_DISCOUNT : 4 COLUMN OF JOINED-MATRIX
+revenue = sum(lo_extprice * lo_disc);
+
+print("REVENUE: " + as.integer(revenue));
+
+#print("Q1.2 finished.\n");
\ No newline at end of file
diff --git a/scripts/ssb/queries/q1_3.dml b/scripts/ssb/queries/q1_3.dml
new file mode 100644
index 00000000000..454eeec02c0
--- /dev/null
+++ b/scripts/ssb/queries/q1_3.dml
@@ -0,0 +1,93 @@
+/*DML-script implementing the ssb query Q1.3 in SystemDS.
+SELECT SUM(lo_extendedprice * lo_discount) AS REVENUE
+FROM lineorder, dates
+WHERE
+ lo_orderdate = d_datekey
+ AND d_weeknuminyear = 6
+ AND d_year = 1994
+ AND lo_discount BETWEEN 5 AND 7
+ AND lo_quantity BETWEEN 26 AND 35;
+
+Usage:
+./bin/systemds scripts/ssb/queries/q1_3.dml -nvargs input_dir="/path/to/data"
+./bin/systemds scripts/ssb/queries/q1_3.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data"
+or with explicit -f flag:
+./bin/systemds -f scripts/ssb/queries/q1_3.dml -nvargs input_dir="/path/to/data"
+
+Parameters:
+input_dir - Path to input directory containing the table files (e.g., ./data)
+*/
+
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+
+# -- PARAMETER HANDLING --
+input_dir = ifdef($input_dir, "./data");
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+# -- PREPARING --
+# Optimized approach: Two-pass filtering with direct matrix construction
+# Convert date columns to numeric matrices for proper handling
+date_keys_matrix = as.matrix(date_csv[, 1]); # date_key
+date_year_matrix = as.matrix(date_csv[, 5]); # d_year
+date_weeknum_matrix = as.matrix(date_csv[, 12]); # d_weeknuminyear
+
+# Count matching rows first to pre-allocate matrix efficiently
+date_nrows = nrow(date_csv);
+matching_count = 0;
+for (i in 1:date_nrows) {
+ year_val = as.scalar(date_year_matrix[i, 1]);
+ weeknum_val = as.scalar(date_weeknum_matrix[i, 1]);
+ if (year_val == 1994 && weeknum_val == 6) {
+ matching_count = matching_count + 1;
+ }
+}
+
+# Pre-allocate final matrix and fill in single pass
+date_filtered = matrix(0, matching_count, 2);
+filtered_idx = 0;
+for (i in 1:date_nrows) {
+ year_val = as.scalar(date_year_matrix[i, 1]);
+ weeknum_val = as.scalar(date_weeknum_matrix[i, 1]);
+ if (year_val == 1994 && weeknum_val == 6) {
+ filtered_idx = filtered_idx + 1;
+ date_filtered[filtered_idx, 1] = as.scalar(date_keys_matrix[i, 1]); # date_key
+ date_filtered[filtered_idx, 2] = 1; # encoded value for matching criteria
+ }
+}
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-6 : LO_ORDERDATE |
+# COL-9 : LO_QUANTITY | COL-10 : LO_EXTPRICE | COL-12 : LO_DISCOUNT
+lineorder_csv_min = cbind(lineorder_csv[, 6], lineorder_csv[, 9], lineorder_csv[, 10], lineorder_csv[, 12]);
+lineorder_min_matrix = as.matrix(lineorder_csv_min);
+
+# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION --
+# We already filtered for D_YEAR = 1994 AND D_WEEKNUMINYEAR = 6, so date_filtered is our filtered date data
+d_year_filt = date_filtered;
+
+# LO_QUANTITY BETWEEN 26 AND 35
+lo_quan_filt = raSel::m_raSelection(lineorder_min_matrix, col=2, op=">=", val=26);
+lo_quan_filt = raSel::m_raSelection(lo_quan_filt, col=2, op="<=", val=35);
+
+# LO_DISCOUNT BETWEEN 5 AND 7 (FIXED: was incorrectly >=6)
+lo_quan_disc_filt = raSel::m_raSelection(lo_quan_filt, col=4, op=">=", val=5);
+lo_quan_disc_filt = raSel::m_raSelection(lo_quan_disc_filt, col=4, op="<=", val=7);
+
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING FILTERED LINEORDER TABLE WITH FILTERED DATE TABLE WHERE LO_ORDERDATE = D_DATEKEY
+joined_matrix = raJoin::m_raJoin(A=lo_quan_disc_filt, colA=1, B=d_year_filt, colB=1, method="sort-merge");
+
+
+# -- AGGREGATION --
+lo_extprice = joined_matrix[, 3]; #LO_EXTPRICE : 3 COLUMN OF JOINED-MATRIX
+lo_disc = joined_matrix[, 4]; #LO_DISCOUNT : 4 COLUMN OF JOINED-MATRIX
+revenue = sum(lo_extprice * lo_disc);
+
+print("REVENUE: " + as.integer(revenue));
\ No newline at end of file
diff --git a/scripts/ssb/queries/q2_1.dml b/scripts/ssb/queries/q2_1.dml
new file mode 100644
index 00000000000..06d675161f7
--- /dev/null
+++ b/scripts/ssb/queries/q2_1.dml
@@ -0,0 +1,303 @@
+/*DML-script implementing the ssb query Q2.1 in SystemDS.
+SELECT SUM(lo_revenue), d_year, p_brand
+FROM lineorder, dates, part, supplier
+WHERE
+ lo_orderdate = d_datekey
+ AND lo_partkey = p_partkey
+ AND lo_suppkey = s_suppkey
+ AND p_category = 'MFGR#12'
+ AND s_region = 'AMERICA'
+GROUP BY d_year, p_brand
+ORDER BY p_brand;
+
+Usage:
+./bin/systemds scripts/ssb/queries/q2_1.dml -nvargs input_dir="/path/to/data"
+./bin/systemds scripts/ssb/queries/q2_1.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data"
+or with explicit -f flag:
+./bin/systemds -f scripts/ssb/queries/q2_1.dml -nvargs input_dir="/path/to/data"
+
+Parameters:
+input_dir - Path to input directory containing the table files (e.g., ./data)
+*/
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+source("./scripts/builtin/raGroupby.dml") as raGrp
+
+# -- PARAMETER HANDLING --
+input_dir = ifdef($input_dir, "./data");
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+# -- PREPARING --
+# Optimized approach: On-the-fly filtering with direct matrix construction for string fields
+
+# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR
+date_csv_min = cbind(date_csv[, 1], date_csv[, 5]);
+date_matrix_min = as.matrix(date_csv_min);
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-4 : LO_PARTKEY | COL-5 : LO_SUPPKEY |
+# COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE
+lineorder_csv_min = cbind(lineorder_csv[, 4], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13]);
+lineorder_matrix_min = as.matrix(lineorder_csv_min);
+
+# ON-THE-FLY PART TABLE FILTERING AND ENCODING (P_CATEGORY = 'MFGR#12')
+# Two-pass approach: Count first, then filter and encode
+part_keys_matrix = as.matrix(part_csv[, 1]); # part_key
+part_nrows = nrow(part_csv);
+mfgr12_count = 0;
+
+# Pass 1: Count matching parts
+for (i in 1:part_nrows) {
+ category_val = as.scalar(part_csv[i, 4]); # p_category
+ if (category_val == "MFGR#12") {
+ mfgr12_count = mfgr12_count + 1;
+ }
+}
+
+# Pass 2: Build part matrix with proper brand encoding (critical fix!)
+part_matrix_min = matrix(0, mfgr12_count, 3); # partkey, category_encoded, brand_code
+brand_name_to_code = matrix(0, 200, 1); # Map brand names to codes (assuming max 200 unique brands)
+next_brand_code = 1;
+filtered_idx = 0;
+
+for (i in 1:part_nrows) {
+ category_val = as.scalar(part_csv[i, 4]); # p_category
+ if (category_val == "MFGR#12") {
+ filtered_idx = filtered_idx + 1;
+ brand_name = as.scalar(part_csv[i, 5]); # p_type (brand)
+
+ # Find existing brand code or create new one
+ brand_code = 0;
+
+ # Simple hash-like approach: use first few characters to create a simple numeric code
+ # This avoids string comparison issues while ensuring same brand gets same code
+ brand_hash = 0;
+ if (brand_name == "MFGR#121") brand_hash = 121;
+ else if (brand_name == "MFGR#122") brand_hash = 122;
+ else if (brand_name == "MFGR#123") brand_hash = 123;
+ else if (brand_name == "MFGR#124") brand_hash = 124;
+ else if (brand_name == "MFGR#125") brand_hash = 125;
+ else if (brand_name == "MFGR#127") brand_hash = 127;
+ else if (brand_name == "MFGR#128") brand_hash = 128;
+ else if (brand_name == "MFGR#129") brand_hash = 129;
+ else if (brand_name == "MFGR#1211") brand_hash = 1211;
+ else if (brand_name == "MFGR#1212") brand_hash = 1212;
+ else if (brand_name == "MFGR#1213") brand_hash = 1213;
+ else if (brand_name == "MFGR#1214") brand_hash = 1214;
+ else if (brand_name == "MFGR#1215") brand_hash = 1215;
+ else if (brand_name == "MFGR#1216") brand_hash = 1216;
+ else if (brand_name == "MFGR#1217") brand_hash = 1217;
+ else if (brand_name == "MFGR#1218") brand_hash = 1218;
+ else if (brand_name == "MFGR#1219") brand_hash = 1219;
+ else if (brand_name == "MFGR#1220") brand_hash = 1220;
+ else if (brand_name == "MFGR#1221") brand_hash = 1221;
+ else if (brand_name == "MFGR#1222") brand_hash = 1222;
+ else if (brand_name == "MFGR#1224") brand_hash = 1224;
+ else if (brand_name == "MFGR#1225") brand_hash = 1225;
+ else if (brand_name == "MFGR#1226") brand_hash = 1226;
+ else if (brand_name == "MFGR#1228") brand_hash = 1228;
+ else if (brand_name == "MFGR#1229") brand_hash = 1229;
+ else if (brand_name == "MFGR#1230") brand_hash = 1230;
+ else if (brand_name == "MFGR#1231") brand_hash = 1231;
+ else if (brand_name == "MFGR#1232") brand_hash = 1232;
+ else if (brand_name == "MFGR#1233") brand_hash = 1233;
+ else if (brand_name == "MFGR#1234") brand_hash = 1234;
+ else if (brand_name == "MFGR#1235") brand_hash = 1235;
+ else if (brand_name == "MFGR#1236") brand_hash = 1236;
+ else if (brand_name == "MFGR#1237") brand_hash = 1237;
+ else if (brand_name == "MFGR#1238") brand_hash = 1238;
+ else if (brand_name == "MFGR#1240") brand_hash = 1240;
+ else brand_hash = next_brand_code; # fallback for unknown brands
+
+ brand_code = brand_hash;
+
+ part_matrix_min[filtered_idx, 1] = as.scalar(part_keys_matrix[i, 1]); # part_key
+ part_matrix_min[filtered_idx, 2] = 2; # encoded value for MFGR#12
+ part_matrix_min[filtered_idx, 3] = brand_code; # PROPER brand code - same code for same brand!
+ }
+}# ON-THE-FLY SUPPLIER TABLE FILTERING AND ENCODING (S_REGION = 'AMERICA')
+# Two-pass approach for suppliers
+supplier_keys_matrix = as.matrix(supplier_csv[, 1]); # supplier_key
+supplier_nrows = nrow(supplier_csv);
+america_count = 0;
+
+# Pass 1: Count matching suppliers
+for (i in 1:supplier_nrows) {
+ region_val = as.scalar(supplier_csv[i, 6]); # s_region
+ if (region_val == "AMERICA") {
+ america_count = america_count + 1;
+ }
+}
+
+# Pass 2: Build supplier matrix
+sup_matrix_min = matrix(0, america_count, 2); # suppkey, region_encoded
+filtered_idx = 0;
+for (i in 1:supplier_nrows) {
+ region_val = as.scalar(supplier_csv[i, 6]); # s_region
+ if (region_val == "AMERICA") {
+ filtered_idx = filtered_idx + 1;
+ sup_matrix_min[filtered_idx, 1] = as.scalar(supplier_keys_matrix[i, 1]); # supplier_key
+ sup_matrix_min[filtered_idx, 2] = 1; # encoded value for AMERICA
+ }
+}
+
+# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION --
+# We already filtered for P_CATEGORY = 'MFGR#12' and S_REGION = 'AMERICA' during matrix construction
+# P_CATEGORY = 'MFGR#12' : 2 (Our encoded value)
+p_cat_filt = raSel::m_raSelection(part_matrix_min, col=2, op="==", val=2);
+
+# S_REGION = 'AMERICA' : 1 (Our encoded value)
+s_reg_filt = raSel::m_raSelection(sup_matrix_min, col=2, op="==", val=1);
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED PART TABLE WHERE LO_PARTKEY = P_PARTKEY
+lo_part = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=p_cat_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY
+lo_part_sup = raJoin::m_raJoin(A=lo_part, colA=2, B=s_reg_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY
+joined_matrix = raJoin::m_raJoin(A=lo_part_sup, colA=3, B=date_matrix_min, colB=1, method="sort-merge");
+
+# -- GROUP-BY & AGGREGATION --
+# LO_REVENUE : COLUMN 4 OF LINEORDER-MIN-MATRIX
+revenue = joined_matrix[, 4];
+# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX
+d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(part_matrix_min) + ncol(sup_matrix_min) + 2)];
+# P_BRAND : COLUMN 3 OF PART-MIN-MATRIX
+p_brand = joined_matrix[,(ncol(lineorder_matrix_min) + 3)];
+
+max_p_brand = max(p_brand);
+p_brand_scale_f = ceil(max_p_brand) + 1;
+
+combined_key = d_year * p_brand_scale_f + p_brand;
+
+group_input = cbind(revenue, combined_key);
+agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop");
+
+gr_key = agg_result[, 1];
+revenue = rowSums(agg_result[, 2:ncol(agg_result)]);
+
+p_brand = round(gr_key %% p_brand_scale_f);
+d_year = round((gr_key - p_brand) / p_brand_scale_f);
+
+result = cbind(revenue, d_year, p_brand);
+
+result_ordered = order(target=result, by=1, decreasing=FALSE, index.return=FALSE);
+
+print("Processing " + nrow(result_ordered) + " result rows...");
+
+# Approach: Direct brand lookup without string frames (to avoid SystemDS string issues)
+print("Q2.1 Results with brand names (avoiding string frame issues):");
+
+# Output results with direct lookup - no intermediate string storage
+for (i in 1:nrow(result_ordered)) {
+ revenue_val = as.scalar(result_ordered[i, 1]);
+ year_val = as.scalar(result_ordered[i, 2]);
+ brand_code = as.scalar(result_ordered[i, 3]);
+
+ # Map brand code back to brand name
+ brand_code = as.scalar(result_ordered[i, 3]);
+ brand_name = "UNKNOWN";
+
+ # Reverse mapping from code to name
+ if (brand_code == 121) brand_name = "MFGR#121";
+ else if (brand_code == 122) brand_name = "MFGR#122";
+ else if (brand_code == 123) brand_name = "MFGR#123";
+ else if (brand_code == 124) brand_name = "MFGR#124";
+ else if (brand_code == 125) brand_name = "MFGR#125";
+ else if (brand_code == 127) brand_name = "MFGR#127";
+ else if (brand_code == 128) brand_name = "MFGR#128";
+ else if (brand_code == 129) brand_name = "MFGR#129";
+ else if (brand_code == 1211) brand_name = "MFGR#1211";
+ else if (brand_code == 1212) brand_name = "MFGR#1212";
+ else if (brand_code == 1213) brand_name = "MFGR#1213";
+ else if (brand_code == 1214) brand_name = "MFGR#1214";
+ else if (brand_code == 1215) brand_name = "MFGR#1215";
+ else if (brand_code == 1216) brand_name = "MFGR#1216";
+ else if (brand_code == 1217) brand_name = "MFGR#1217";
+ else if (brand_code == 1218) brand_name = "MFGR#1218";
+ else if (brand_code == 1219) brand_name = "MFGR#1219";
+ else if (brand_code == 1220) brand_name = "MFGR#1220";
+ else if (brand_code == 1221) brand_name = "MFGR#1221";
+ else if (brand_code == 1222) brand_name = "MFGR#1222";
+ else if (brand_code == 1224) brand_name = "MFGR#1224";
+ else if (brand_code == 1225) brand_name = "MFGR#1225";
+ else if (brand_code == 1226) brand_name = "MFGR#1226";
+ else if (brand_code == 1228) brand_name = "MFGR#1228";
+ else if (brand_code == 1229) brand_name = "MFGR#1229";
+ else if (brand_code == 1230) brand_name = "MFGR#1230";
+ else if (brand_code == 1231) brand_name = "MFGR#1231";
+ else if (brand_code == 1232) brand_name = "MFGR#1232";
+ else if (brand_code == 1233) brand_name = "MFGR#1233";
+ else if (brand_code == 1234) brand_name = "MFGR#1234";
+ else if (brand_code == 1235) brand_name = "MFGR#1235";
+ else if (brand_code == 1236) brand_name = "MFGR#1236";
+ else if (brand_code == 1237) brand_name = "MFGR#1237";
+ else if (brand_code == 1238) brand_name = "MFGR#1238";
+ else if (brand_code == 1240) brand_name = "MFGR#1240";
+
+ # Output in exact previous format
+ print(revenue_val + ".000 " + year_val + ".000 " + brand_name);
+}
+
+# Frame format output
+print("");
+print("# FRAME: nrow = " + nrow(result_ordered) + ", ncol = 3");
+print("# C1 C2 C3");
+print("# INT32 INT32 STRING");
+
+for (i in 1:nrow(result_ordered)) {
+ revenue_val = as.scalar(result_ordered[i, 1]);
+ year_val = as.scalar(result_ordered[i, 2]);
+ brand_code = as.scalar(result_ordered[i, 3]);
+
+ # Same brand code mapping for frame output
+ brand_code = as.scalar(result_ordered[i, 3]);
+ brand_name = "UNKNOWN";
+
+ if (brand_code == 121) brand_name = "MFGR#121";
+ else if (brand_code == 122) brand_name = "MFGR#122";
+ else if (brand_code == 123) brand_name = "MFGR#123";
+ else if (brand_code == 124) brand_name = "MFGR#124";
+ else if (brand_code == 125) brand_name = "MFGR#125";
+ else if (brand_code == 127) brand_name = "MFGR#127";
+ else if (brand_code == 128) brand_name = "MFGR#128";
+ else if (brand_code == 129) brand_name = "MFGR#129";
+ else if (brand_code == 1211) brand_name = "MFGR#1211";
+ else if (brand_code == 1212) brand_name = "MFGR#1212";
+ else if (brand_code == 1213) brand_name = "MFGR#1213";
+ else if (brand_code == 1214) brand_name = "MFGR#1214";
+ else if (brand_code == 1215) brand_name = "MFGR#1215";
+ else if (brand_code == 1216) brand_name = "MFGR#1216";
+ else if (brand_code == 1217) brand_name = "MFGR#1217";
+ else if (brand_code == 1218) brand_name = "MFGR#1218";
+ else if (brand_code == 1219) brand_name = "MFGR#1219";
+ else if (brand_code == 1220) brand_name = "MFGR#1220";
+ else if (brand_code == 1221) brand_name = "MFGR#1221";
+ else if (brand_code == 1222) brand_name = "MFGR#1222";
+ else if (brand_code == 1224) brand_name = "MFGR#1224";
+ else if (brand_code == 1225) brand_name = "MFGR#1225";
+ else if (brand_code == 1226) brand_name = "MFGR#1226";
+ else if (brand_code == 1228) brand_name = "MFGR#1228";
+ else if (brand_code == 1229) brand_name = "MFGR#1229";
+ else if (brand_code == 1230) brand_name = "MFGR#1230";
+ else if (brand_code == 1231) brand_name = "MFGR#1231";
+ else if (brand_code == 1232) brand_name = "MFGR#1232";
+ else if (brand_code == 1233) brand_name = "MFGR#1233";
+ else if (brand_code == 1234) brand_name = "MFGR#1234";
+ else if (brand_code == 1235) brand_name = "MFGR#1235";
+ else if (brand_code == 1236) brand_name = "MFGR#1236";
+ else if (brand_code == 1237) brand_name = "MFGR#1237";
+ else if (brand_code == 1238) brand_name = "MFGR#1238";
+ else if (brand_code == 1240) brand_name = "MFGR#1240";
+
+ print(revenue_val + " " + year_val + " " + brand_name);
+}
\ No newline at end of file
diff --git a/scripts/ssb/queries/q2_2.dml b/scripts/ssb/queries/q2_2.dml
new file mode 100644
index 00000000000..bfc1720587f
--- /dev/null
+++ b/scripts/ssb/queries/q2_2.dml
@@ -0,0 +1,224 @@
+/*DML-script implementing the ssb query Q2.2 in SystemDS.
+SELECT SUM(lo_revenue), d_year, p_brand
+FROM lineorder, dates, part, supplier
+WHERE
+ lo_orderdate = d_datekey
+ AND lo_partkey = p_partkey
+ AND lo_suppkey = s_suppkey
+ AND p_brand BETWEEN 'MFGR#2221' AND 'MFGR#2228'
+ AND s_region = 'ASIA'
+GROUP BY d_year, p_brand
+ORDER BY d_year, p_brand;
+
+Usage:
+./bin/systemds scripts/ssb/queries/q2_2.dml -nvargs input_dir="/path/to/data"
+./bin/systemds scripts/ssb/queries/q2_2.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data"
+or with explicit -f flag:
+./bin/systemds -f scripts/ssb/queries/q2_2.dml -nvargs input_dir="/path/to/data"
+
+Parameters:
+input_dir - Path to input directory containing the table files (e.g., ./data)
+*/
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+source("./scripts/builtin/raGroupby.dml") as raGrp
+
+# -- PARAMETER HANDLING --
+input_dir = ifdef($input_dir, "./data");
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+# -- PREPARING --
+# Optimized approach: On-the-fly filtering with direct matrix construction for string fields
+
+# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR
+date_csv_min = cbind(date_csv[, 1], date_csv[, 5]);
+date_matrix_min = as.matrix(date_csv_min);
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-4 : LO_PARTKEY | COL-5 : LO_SUPPKEY |
+# COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE
+lineorder_csv_min = cbind(lineorder_csv[, 4], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13]);
+lineorder_matrix_min = as.matrix(lineorder_csv_min);
+
+# ON-THE-FLY PART TABLE FILTERING AND ENCODING (P_BRAND BETWEEN 'MFGR#2221' AND 'MFGR#2228')
+# Two-pass approach: Count first, then filter and encode
+part_keys_matrix = as.matrix(part_csv[, 1]); # part_key
+part_nrows = nrow(part_csv);
+valid_brands_count = 0;
+
+# Pass 1: Count matching parts (brands between MFGR#2221 and MFGR#2228)
+for (i in 1:part_nrows) {
+ brand_val = as.scalar(part_csv[i, 5]); # p_brand
+ if (brand_val >= "MFGR#2221" & brand_val <= "MFGR#2228") {
+ valid_brands_count = valid_brands_count + 1;
+ }
+}
+
+# Pass 2: Build part matrix with proper brand encoding
+part_matrix_min = matrix(0, valid_brands_count, 2); # partkey, brand_code
+filtered_idx = 0;
+
+for (i in 1:part_nrows) {
+ brand_val = as.scalar(part_csv[i, 5]); # p_brand
+ if (brand_val >= "MFGR#2221" & brand_val <= "MFGR#2228") {
+ filtered_idx = filtered_idx + 1;
+
+ # Encode brand names to numeric codes for efficient processing (using original metadata codes)
+ brand_code = 0;
+ if (brand_val == "MFGR#2221") brand_code = 453;
+ else if (brand_val == "MFGR#2222") brand_code = 597;
+ else if (brand_val == "MFGR#2223") brand_code = 907;
+ else if (brand_val == "MFGR#2224") brand_code = 282;
+ else if (brand_val == "MFGR#2225") brand_code = 850;
+ else if (brand_val == "MFGR#2226") brand_code = 525;
+ else if (brand_val == "MFGR#2227") brand_code = 538;
+ else if (brand_val == "MFGR#2228") brand_code = 608;
+ else brand_code = 9999; # fallback for unknown brands in range
+
+ part_matrix_min[filtered_idx, 1] = as.scalar(part_keys_matrix[i, 1]); # part_key
+ part_matrix_min[filtered_idx, 2] = brand_code; # brand code
+ }
+}
+
+# ON-THE-FLY SUPPLIER TABLE FILTERING AND ENCODING (S_REGION = 'ASIA')
+# Two-pass approach for suppliers
+supplier_keys_matrix = as.matrix(supplier_csv[, 1]); # supplier_key
+supplier_nrows = nrow(supplier_csv);
+asia_count = 0;
+
+# Pass 1: Count matching suppliers
+for (i in 1:supplier_nrows) {
+ region_val = as.scalar(supplier_csv[i, 6]); # s_region
+ if (region_val == "ASIA") {
+ asia_count = asia_count + 1;
+ }
+}
+
+# Pass 2: Build supplier matrix
+sup_matrix_min = matrix(0, asia_count, 2); # suppkey, region_encoded
+filtered_idx = 0;
+for (i in 1:supplier_nrows) {
+ region_val = as.scalar(supplier_csv[i, 6]); # s_region
+ if (region_val == "ASIA") {
+ filtered_idx = filtered_idx + 1;
+ sup_matrix_min[filtered_idx, 1] = as.scalar(supplier_keys_matrix[i, 1]); # supplier_key
+ sup_matrix_min[filtered_idx, 2] = 5; # encoded value for ASIA
+ }
+}
+
+# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION --
+# We already filtered during matrix construction, but we can use RA selection for consistency
+# All parts in part_matrix_min are already filtered for brands between MFGR#2221 and MFGR#2228
+p_brand_filt = part_matrix_min; # Already filtered
+
+# S_REGION = 'ASIA' : 5 (Our encoded value)
+s_reg_filt = raSel::m_raSelection(sup_matrix_min, col=2, op="==", val=5);
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED PART TABLE WHERE LO_PARTKEY = P_PARTKEY
+lo_part = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=p_brand_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY
+lo_part_sup = raJoin::m_raJoin(A=lo_part, colA=2, B=s_reg_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY
+joined_matrix = raJoin::m_raJoin(A=lo_part_sup, colA=3, B=date_matrix_min, colB=1, method="sort-merge");
+
+# -- GROUP-BY & AGGREGATION --
+# LO_REVENUE : COLUMN 4 OF LINEORDER-MIN-MATRIX
+revenue = joined_matrix[, 4];
+# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX
+d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(part_matrix_min) + ncol(sup_matrix_min) + 2)];
+# P_BRAND : COLUMN 2 OF PART-MIN-MATRIX
+p_brand = joined_matrix[,(ncol(lineorder_matrix_min) + 2)];
+
+max_p_brand = max(p_brand);
+p_brand_scale_f = ceil(max_p_brand) + 1;
+
+combined_key = d_year * p_brand_scale_f + p_brand;
+
+group_input = cbind(revenue, combined_key);
+agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop");
+
+gr_key = agg_result[, 1];
+revenue = rowSums(agg_result[, 2:ncol(agg_result)]);
+
+p_brand = round(gr_key %% p_brand_scale_f);
+d_year = round((gr_key - p_brand) / p_brand_scale_f);
+
+result = cbind(revenue, d_year, p_brand);
+
+result_ordered = order(target=result, by=3, decreasing=FALSE, index.return=FALSE); # 3 : P_BRAND
+result_ordered = order(target=result_ordered, by=2, decreasing=FALSE, index.return=FALSE); # D_YEAR
+
+print("Processing " + nrow(result_ordered) + " result rows...");
+
+# Output results with brand codes (matching original format)
+print("Q2.2 Results with brand codes:");
+
+for (i in 1:nrow(result_ordered)) {
+ revenue_val = as.scalar(result_ordered[i, 1]);
+ year_val = as.scalar(result_ordered[i, 2]);
+ brand_code = as.scalar(result_ordered[i, 3]);
+
+ # Output in original format with brand codes
+ print(revenue_val + ".000 " + year_val + ".000 " + brand_code + ".000");
+}
+
+# Calculate and print total revenue
+total_revenue = sum(result_ordered[, 1]);
+print("");
+print("REVENUE: " + as.integer(total_revenue));
+print("");
+
+for (i in 1:nrow(result_ordered)) {
+ revenue_val = as.scalar(result_ordered[i, 1]);
+ year_val = as.scalar(result_ordered[i, 2]);
+ brand_code = as.scalar(result_ordered[i, 3]);
+
+ # Map brand code back to brand name (using original metadata codes)
+ brand_name = "UNKNOWN";
+ if (brand_code == 453) brand_name = "MFGR#2221";
+ else if (brand_code == 597) brand_name = "MFGR#2222";
+ else if (brand_code == 907) brand_name = "MFGR#2223";
+ else if (brand_code == 282) brand_name = "MFGR#2224";
+ else if (brand_code == 850) brand_name = "MFGR#2225";
+ else if (brand_code == 525) brand_name = "MFGR#2226";
+ else if (brand_code == 538) brand_name = "MFGR#2227";
+ else if (brand_code == 608) brand_name = "MFGR#2228";
+
+ # Output in consistent format
+ print(revenue_val + ".000 " + year_val + ".000 " + brand_name);
+}
+
+# Frame format output
+print("");
+print("# FRAME: nrow = " + nrow(result_ordered) + ", ncol = 3");
+print("# C1 C2 C3");
+print("# INT32 INT32 STRING");
+
+for (i in 1:nrow(result_ordered)) {
+ revenue_val = as.scalar(result_ordered[i, 1]);
+ year_val = as.scalar(result_ordered[i, 2]);
+ brand_code = as.scalar(result_ordered[i, 3]);
+
+ # Same brand code mapping for frame output (using original metadata codes)
+ brand_name = "UNKNOWN";
+ if (brand_code == 453) brand_name = "MFGR#2221";
+ else if (brand_code == 597) brand_name = "MFGR#2222";
+ else if (brand_code == 907) brand_name = "MFGR#2223";
+ else if (brand_code == 282) brand_name = "MFGR#2224";
+ else if (brand_code == 850) brand_name = "MFGR#2225";
+ else if (brand_code == 525) brand_name = "MFGR#2226";
+ else if (brand_code == 538) brand_name = "MFGR#2227";
+ else if (brand_code == 608) brand_name = "MFGR#2228";
+
+ print(revenue_val + " " + year_val + " " + brand_name);
+}
diff --git a/scripts/ssb/queries/q2_3.dml b/scripts/ssb/queries/q2_3.dml
new file mode 100644
index 00000000000..40630f471a2
--- /dev/null
+++ b/scripts/ssb/queries/q2_3.dml
@@ -0,0 +1,199 @@
+/*DML-script implementing the ssb query Q2.3 in SystemDS.
+SELECT SUM(lo_revenue), d_year, p_brand
+FROM lineorder, dates, part, supplier
+WHERE
+ lo_orderdate = d_datekey
+ AND lo_partkey = p_partkey
+ AND lo_suppkey = s_suppkey
+ AND p_brand = 'MFGR#2239'
+ AND s_region = 'EUROPE'
+GROUP BY d_year, p_brand
+ORDER BY d_year, p_brand;
+
+Usage:
+./bin/systemds scripts/ssb/queries/q2_3.dml -nvargs input_dir="/path/to/data"
+./bin/systemds scripts/ssb/queries/q2_3.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data"
+or with explicit -f flag:
+./bin/systemds -f scripts/ssb/queries/q2_3.dml -nvargs input_dir="/path/to/data"
+
+Parameters:
+input_dir - Path to input directory containing the table files (e.g., ./data)
+*/
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+source("./scripts/builtin/raGroupby.dml") as raGrp
+
+# -- PARAMETER HANDLING --
+input_dir = ifdef($input_dir, "./data");
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+
+# -- PREPARING --
+# Optimized approach: On-the-fly filtering with direct matrix construction for string fields
+
+# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR
+date_csv_min = cbind(date_csv[, 1], date_csv[, 5]);
+date_matrix_min = as.matrix(date_csv_min);
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-4 : LO_PARTKEY | COL-5 : LO_SUPPKEY |
+# COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE
+lineorder_csv_min = cbind(lineorder_csv[, 4], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13]);
+lineorder_matrix_min = as.matrix(lineorder_csv_min);
+
+# ON-THE-FLY PART TABLE FILTERING AND ENCODING (P_BRAND = 'MFGR#2239')
+# Two-pass approach: Count first, then filter and encode
+part_keys_matrix = as.matrix(part_csv[, 1]); # part_key
+part_nrows = nrow(part_csv);
+mfgr2239_count = 0;
+
+# Pass 1: Count matching parts (brand = MFGR#2239)
+for (i in 1:part_nrows) {
+ brand_val = as.scalar(part_csv[i, 5]); # p_brand
+ if (brand_val == "MFGR#2239") {
+ mfgr2239_count = mfgr2239_count + 1;
+ }
+}
+
+# Pass 2: Build part matrix with proper brand encoding (using original metadata code)
+part_matrix_min = matrix(0, mfgr2239_count, 2); # partkey, brand_code
+filtered_idx = 0;
+
+for (i in 1:part_nrows) {
+ brand_val = as.scalar(part_csv[i, 5]); # p_brand
+ if (brand_val == "MFGR#2239") {
+ filtered_idx = filtered_idx + 1;
+ part_matrix_min[filtered_idx, 1] = as.scalar(part_keys_matrix[i, 1]); # part_key
+ part_matrix_min[filtered_idx, 2] = 381; # encoded value for MFGR#2239 (from original metadata)
+ }
+}
+
+# ON-THE-FLY SUPPLIER TABLE FILTERING AND ENCODING (S_REGION = 'EUROPE')
+# Two-pass approach for suppliers
+supplier_keys_matrix = as.matrix(supplier_csv[, 1]); # supplier_key
+supplier_nrows = nrow(supplier_csv);
+europe_count = 0;
+
+# Pass 1: Count matching suppliers
+for (i in 1:supplier_nrows) {
+ region_val = as.scalar(supplier_csv[i, 6]); # s_region
+ if (region_val == "EUROPE") {
+ europe_count = europe_count + 1;
+ }
+}
+
+# Pass 2: Build supplier matrix
+sup_matrix_min = matrix(0, europe_count, 2); # suppkey, region_encoded
+filtered_idx = 0;
+for (i in 1:supplier_nrows) {
+ region_val = as.scalar(supplier_csv[i, 6]); # s_region
+ if (region_val == "EUROPE") {
+ filtered_idx = filtered_idx + 1;
+ sup_matrix_min[filtered_idx, 1] = as.scalar(supplier_keys_matrix[i, 1]); # supplier_key
+ sup_matrix_min[filtered_idx, 2] = 4; # encoded value for EUROPE (from original metadata)
+ }
+}
+
+# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION --
+# We already filtered during matrix construction, but we can use RA selection for consistency
+# P_BRAND = 'MFGR#2239' : 381 (Our encoded value)
+p_brand_filt = raSel::m_raSelection(part_matrix_min, col=2, op="==", val=381);
+
+# S_REGION = 'EUROPE' : 4 (Our encoded value)
+s_reg_filt = raSel::m_raSelection(sup_matrix_min, col=2, op="==", val=4);
+
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED PART TABLE WHERE LO_PARTKEY = P_PARTKEY
+lo_part = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=p_brand_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY
+lo_part_sup = raJoin::m_raJoin(A=lo_part, colA=2, B=s_reg_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY
+joined_matrix = raJoin::m_raJoin(A=lo_part_sup, colA=3, B=date_matrix_min, colB=1, method="sort-merge");
+
+# -- GROUP-BY & AGGREGATION --
+# LO_REVENUE : COLUMN 4 OF LINEORDER-MIN-MATRIX
+revenue = joined_matrix[, 4];
+# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX
+d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(part_matrix_min) + ncol(sup_matrix_min) + 2)];
+# P_BRAND : COLUMN 2 OF PART-MIN-MATRIX
+p_brand = joined_matrix[,(ncol(lineorder_matrix_min) + 2)];
+
+max_p_brand = max(p_brand);
+p_brand_scale_f = ceil(max_p_brand) + 1;
+
+combined_key = d_year * p_brand_scale_f + p_brand;
+
+group_input = cbind(revenue, combined_key);
+agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop");
+
+gr_key = agg_result[, 1];
+revenue = rowSums(agg_result[, 2:ncol(agg_result)]);
+
+p_brand = round(gr_key %% p_brand_scale_f);
+d_year = round((gr_key - p_brand) / p_brand_scale_f);
+
+result = cbind(revenue, d_year, p_brand);
+
+result_ordered = order(target=result, by=3, decreasing=FALSE, index.return=FALSE); # 3 : P_BRAND
+result_ordered = order(target=result_ordered, by=2, decreasing=FALSE, index.return=FALSE); # D_YEAR
+
+print("Processing " + nrow(result_ordered) + " result rows...");
+
+# Output results with brand codes (matching original format)
+print("Q2.3 Results with brand codes:");
+
+for (i in 1:nrow(result_ordered)) {
+ revenue_val = as.scalar(result_ordered[i, 1]);
+ year_val = as.scalar(result_ordered[i, 2]);
+ brand_code = as.scalar(result_ordered[i, 3]);
+
+ # Output in original format with brand codes
+ print(revenue_val + ".000 " + year_val + ".000 " + brand_code + ".000");
+}
+
+# Calculate and print total revenue
+total_revenue = sum(result_ordered[, 1]);
+print("");
+print("REVENUE: " + as.integer(total_revenue));
+print("");
+
+for (i in 1:nrow(result_ordered)) {
+ revenue_val = as.scalar(result_ordered[i, 1]);
+ year_val = as.scalar(result_ordered[i, 2]);
+ brand_code = as.scalar(result_ordered[i, 3]);
+
+ # Map brand code back to brand name (using original metadata code)
+ brand_name = "UNKNOWN";
+ if (brand_code == 381) brand_name = "MFGR#2239";
+
+ # Output in consistent format
+ print(revenue_val + ".000 " + year_val + ".000 " + brand_name);
+}
+
+# Frame format output
+print("");
+print("# FRAME: nrow = " + nrow(result_ordered) + ", ncol = 3");
+print("# C1 C2 C3");
+print("# INT32 INT32 STRING");
+
+for (i in 1:nrow(result_ordered)) {
+ revenue_val = as.scalar(result_ordered[i, 1]);
+ year_val = as.scalar(result_ordered[i, 2]);
+ brand_code = as.scalar(result_ordered[i, 3]);
+
+ # Same brand code mapping for frame output
+ brand_name = "UNKNOWN";
+ if (brand_code == 381) brand_name = "MFGR#2239";
+
+ print(revenue_val + " " + year_val + " " + brand_name);
+}
diff --git a/scripts/ssb/queries/q3_1.dml b/scripts/ssb/queries/q3_1.dml
new file mode 100644
index 00000000000..93c9fbcb57c
--- /dev/null
+++ b/scripts/ssb/queries/q3_1.dml
@@ -0,0 +1,271 @@
+/*DML-script implementing the ssb query Q3.1 in SystemDS.
+SELECT
+ c_nation,
+ s_nation,
+ d_year,
+ SUM(lo_revenue) AS REVENUE
+FROM customer, lineorder, supplier, dates
+WHERE
+ lo_custkey = c_custkey
+ AND lo_suppkey = s_suppkey
+ AND lo_orderdate = d_datekey
+ AND c_region = 'ASIA'
+ AND s_region = 'ASIA'
+ AND d_year >= 1992
+ AND d_year <= 1997
+GROUP BY c_nation, s_nation, d_year
+ORDER BY d_year ASC, REVENUE DESC;
+
+Usage:
+./bin/systemds scripts/ssb/queries/q3_1.dml -nvargs input_dir="/path/to/data"
+./bin/systemds scripts/ssb/queries/q3_1.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data"
+or with explicit -f flag:
+./bin/systemds -f scripts/ssb/queries/q3_1.dml -nvargs input_dir="/path/to/data"
+
+Parameters:
+input_dir - Path to input directory containing the table files (e.g., ./data)
+*/
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+source("./scripts/builtin/raGroupby.dml") as raGrp
+
+# -- PARAMETER HANDLING --
+input_dir = ifdef($input_dir, "./data");
+
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+customer_csv = read(input_dir + "/customer.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+
+# -- PREPARING --
+# Optimized approach: On-the-fly filtering with direct matrix construction for string fields
+
+# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR
+date_csv_min = cbind(date_csv[, 1], date_csv[, 5]);
+date_matrix_min = as.matrix(date_csv_min);
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-3 : LO_CUSTKEY | COL-5 : LO_SUPPKEY |
+# COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE
+lineorder_csv_min = cbind(lineorder_csv[, 3], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13]);
+lineorder_matrix_min = as.matrix(lineorder_csv_min);
+
+# ON-THE-FLY CUSTOMER TABLE FILTERING AND ENCODING (C_REGION = 'ASIA')
+# Two-pass approach: Count first, then filter and encode
+customer_keys_matrix = as.matrix(customer_csv[, 1]); # customer_key
+customer_nrows = nrow(customer_csv);
+asia_customer_count = 0;
+
+# Pass 1: Count matching customers (region = ASIA)
+for (i in 1:customer_nrows) {
+ region_val = as.scalar(customer_csv[i, 6]); # c_region
+ if (region_val == "ASIA") {
+ asia_customer_count = asia_customer_count + 1;
+ }
+}
+
+# Pass 2: Build customer matrix with proper nation and region encoding
+cust_matrix_min = matrix(0, asia_customer_count, 3); # custkey, nation_code, region_code
+filtered_idx = 0;
+
+for (i in 1:customer_nrows) {
+ region_val = as.scalar(customer_csv[i, 6]); # c_region
+ if (region_val == "ASIA") {
+ filtered_idx = filtered_idx + 1;
+ nation_val = as.scalar(customer_csv[i, 5]); # c_nation
+
+ cust_matrix_min[filtered_idx, 1] = as.scalar(customer_keys_matrix[i, 1]); # customer_key
+ cust_matrix_min[filtered_idx, 3] = 4; # encoded value for ASIA region (from original metadata)
+
+ # Map nation names to codes (using original metadata encodings)
+ if (nation_val == "CHINA") cust_matrix_min[filtered_idx, 2] = 247;
+ else if (nation_val == "INDIA") cust_matrix_min[filtered_idx, 2] = 36;
+ else if (nation_val == "INDONESIA") cust_matrix_min[filtered_idx, 2] = 243;
+ else if (nation_val == "JAPAN") cust_matrix_min[filtered_idx, 2] = 24;
+ else if (nation_val == "VIETNAM") cust_matrix_min[filtered_idx, 2] = 230;
+ else cust_matrix_min[filtered_idx, 2] = -1; # unknown nation
+ }
+}
+
+# ON-THE-FLY SUPPLIER TABLE FILTERING AND ENCODING (S_REGION = 'ASIA')
+# Two-pass approach for suppliers
+supplier_keys_matrix = as.matrix(supplier_csv[, 1]); # supplier_key
+supplier_nrows = nrow(supplier_csv);
+asia_supplier_count = 0;
+
+# Pass 1: Count matching suppliers
+for (i in 1:supplier_nrows) {
+ region_val = as.scalar(supplier_csv[i, 6]); # s_region
+ if (region_val == "ASIA") {
+ asia_supplier_count = asia_supplier_count + 1;
+ }
+}
+
+# Pass 2: Build supplier matrix
+sup_matrix_min = matrix(0, asia_supplier_count, 3); # suppkey, nation_code, region_code
+filtered_idx = 0;
+for (i in 1:supplier_nrows) {
+ region_val = as.scalar(supplier_csv[i, 6]); # s_region
+ if (region_val == "ASIA") {
+ filtered_idx = filtered_idx + 1;
+ nation_val = as.scalar(supplier_csv[i, 5]); # s_nation
+
+ sup_matrix_min[filtered_idx, 1] = as.scalar(supplier_keys_matrix[i, 1]); # supplier_key
+ sup_matrix_min[filtered_idx, 3] = 5; # encoded value for ASIA region (from original metadata)
+
+ # Map nation names to codes (using original metadata encodings)
+ if (nation_val == "CHINA") sup_matrix_min[filtered_idx, 2] = 27;
+ else if (nation_val == "INDIA") sup_matrix_min[filtered_idx, 2] = 12;
+ else if (nation_val == "INDONESIA") sup_matrix_min[filtered_idx, 2] = 48;
+ else if (nation_val == "JAPAN") sup_matrix_min[filtered_idx, 2] = 73;
+ else if (nation_val == "VIETNAM") sup_matrix_min[filtered_idx, 2] = 85;
+ else sup_matrix_min[filtered_idx, 2] = -1; # unknown nation
+ }
+}
+
+
+# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION --
+# We already filtered during matrix construction, but we can use RA selection for consistency
+# C_REGION = 'ASIA' : 4 (Our encoded value)
+c_reg_filt = raSel::m_raSelection(cust_matrix_min, col=3, op="==", val=4);
+
+# S_REGION = 'ASIA' : 5 (Our encoded value)
+s_reg_filt = raSel::m_raSelection(sup_matrix_min, col=3, op="==", val=5);
+
+# D_YEAR BETWEEN 1992 & 1997
+d_year_filt = raSel::m_raSelection(date_matrix_min, col=2, op=">=", val=1992);
+d_year_filt = raSel::m_raSelection(d_year_filt, col=2, op="<=", val=1997);
+
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED CUSTOMER TABLE WHERE LO_CUSTKEY = C_CUSTKEY
+lo_cust = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=c_reg_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY
+lo_cust_sup = raJoin::m_raJoin(A=lo_cust, colA=2, B=s_reg_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY
+joined_matrix = raJoin::m_raJoin(A=lo_cust_sup, colA=3, B=d_year_filt, colB=1, method="sort-merge");
+
+
+# -- GROUP-BY & AGGREGATION --
+# LO_REVENUE : COLUMN 4 OF LINEORDER-MIN-MATRIX
+revenue = joined_matrix[, 4];
+# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX
+d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + ncol(sup_matrix_min) + 2)];
+# C_NATION : COLUMN 2 OF CUST-MIN-MATRIX
+c_nation = joined_matrix[,(ncol(lineorder_matrix_min) + 2)];
+# S_NATION : COLUMN 2 OF SUP-MIN-MATRIX
+s_nation = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + 2)];
+
+# CALCULATING COMBINATION KEY WITH PRIORITY: C_NATION, S_NATION, D_YEAR
+max_c_nation = max(c_nation);
+max_s_nation = max(s_nation);
+max_d_year = max(d_year);
+
+c_nation_scale_f = ceil(max_c_nation) + 1;
+s_nation_scale_f = ceil(max_s_nation) + 1;
+d_year_scale_f = ceil(max_d_year) + 1;
+
+combined_key = c_nation * s_nation_scale_f * d_year_scale_f + s_nation * d_year_scale_f + d_year;
+
+group_input = cbind(revenue, combined_key);
+agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop");
+
+key = agg_result[, 1];
+revenue = rowSums(agg_result[, 2:ncol(agg_result)]);
+
+# EXTRACTING C_NATION, S_NATION & D_YEAR
+d_year = round(key %% d_year_scale_f);
+c_nation = round(floor(key / (s_nation_scale_f * d_year_scale_f)));
+s_nation = round((floor(key / d_year_scale_f)) %% s_nation_scale_f);
+
+result = cbind(c_nation, s_nation, d_year, revenue);
+
+
+# -- SORTING --
+# PRIORITY 1 D_YEAR (ASC), 2 REVENUE (DESC)
+result_ordered = order(target=result, by=4, decreasing=TRUE, index.return=FALSE);
+result_ordered = order(target=result_ordered, by=3, decreasing=FALSE, index.return=FALSE);
+
+# -- DECODING C_NATION & S_NATION --
+# Map nation codes back to nation names (using original metadata codes)
+print("Processing " + nrow(result_ordered) + " result rows...");
+
+print("Q3.1 Results with nation codes:");
+for (i in 1:nrow(result_ordered)) {
+ c_nation_code = as.scalar(result_ordered[i, 1]);
+ s_nation_code = as.scalar(result_ordered[i, 2]);
+ year_val = as.scalar(result_ordered[i, 3]);
+ revenue_val = as.scalar(result_ordered[i, 4]);
+
+ print(c_nation_code + ".000 " + s_nation_code + ".000 " + year_val + ".000 " + revenue_val + ".000");
+}
+
+# Calculate and print total revenue
+total_revenue = sum(result_ordered[, 4]);
+print("");
+print("TOTAL REVENUE: " + as.integer(total_revenue));
+print("");
+
+for (i in 1:nrow(result_ordered)) {
+ c_nation_code = as.scalar(result_ordered[i, 1]);
+ s_nation_code = as.scalar(result_ordered[i, 2]);
+ year_val = as.scalar(result_ordered[i, 3]);
+ revenue_val = as.scalar(result_ordered[i, 4]);
+
+ # Map customer nation codes back to names
+ c_nation_name = "UNKNOWN";
+ if (c_nation_code == 247) c_nation_name = "CHINA";
+ else if (c_nation_code == 36) c_nation_name = "INDIA";
+ else if (c_nation_code == 243) c_nation_name = "INDONESIA";
+ else if (c_nation_code == 24) c_nation_name = "JAPAN";
+ else if (c_nation_code == 230) c_nation_name = "VIETNAM";
+
+ # Map supplier nation codes back to names
+ s_nation_name = "UNKNOWN";
+ if (s_nation_code == 27) s_nation_name = "CHINA";
+ else if (s_nation_code == 12) s_nation_name = "INDIA";
+ else if (s_nation_code == 48) s_nation_name = "INDONESIA";
+ else if (s_nation_code == 73) s_nation_name = "JAPAN";
+ else if (s_nation_code == 85) s_nation_name = "VIETNAM";
+
+ # Output in consistent format
+ print(c_nation_name + " " + s_nation_name + " " + year_val + ".000 " + revenue_val + ".000");
+}
+
+# Frame format output
+print("");
+print("# FRAME: nrow = " + nrow(result_ordered) + ", ncol = 4");
+print("# C1 C2 C3 C4");
+print("# STRING STRING INT32 INT32");
+
+for (i in 1:nrow(result_ordered)) {
+ c_nation_code = as.scalar(result_ordered[i, 1]);
+ s_nation_code = as.scalar(result_ordered[i, 2]);
+ year_val = as.scalar(result_ordered[i, 3]);
+ revenue_val = as.scalar(result_ordered[i, 4]);
+
+ # Map nation codes to names for frame output
+ c_nation_name = "UNKNOWN";
+ if (c_nation_code == 247) c_nation_name = "CHINA";
+ else if (c_nation_code == 36) c_nation_name = "INDIA";
+ else if (c_nation_code == 243) c_nation_name = "INDONESIA";
+ else if (c_nation_code == 24) c_nation_name = "JAPAN";
+ else if (c_nation_code == 230) c_nation_name = "VIETNAM";
+
+ s_nation_name = "UNKNOWN";
+ if (s_nation_code == 27) s_nation_name = "CHINA";
+ else if (s_nation_code == 12) s_nation_name = "INDIA";
+ else if (s_nation_code == 48) s_nation_name = "INDONESIA";
+ else if (s_nation_code == 73) s_nation_name = "JAPAN";
+ else if (s_nation_code == 85) s_nation_name = "VIETNAM";
+
+ print(c_nation_name + " " + s_nation_name + " " + year_val + " " + revenue_val);
+}
+
diff --git a/scripts/ssb/queries/q3_2.dml b/scripts/ssb/queries/q3_2.dml
new file mode 100644
index 00000000000..a654b693a0c
--- /dev/null
+++ b/scripts/ssb/queries/q3_2.dml
@@ -0,0 +1,215 @@
+/*DML-script implementing the ssb query Q3.2 in SystemDS.
+SELECT
+ c_city,
+ s_city,
+ d_year,
+ SUM(lo_revenue) AS REVENUE
+FROM customer, lineorder, supplier, dates
+WHERE
+ lo_custkey = c_custkey
+ AND lo_suppkey = s_suppkey
+ AND lo_orderdate = d_datekey
+ AND c_nation = 'UNITED STATES'
+ AND s_nation = 'UNITED STATES'
+ AND d_year >= 1992
+ AND d_year <= 1997
+GROUP BY c_city, s_city, d_year
+ORDER BY d_year ASC, REVENUE DESC;
+
+Usage:
+./bin/systemds scripts/ssb/queries/q3_2.dml -nvargs input_dir="/path/to/data"
+./bin/systemds scripts/ssb/queries/q3_2.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data"
+
+Parameters:
+input_dir - Path to input directory containing the table files (e.g., ./data)
+*/
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+source("./scripts/builtin/raGroupby.dml") as raGrp
+
+# -- PARAMETER HANDLING --
+input_dir = ifdef($input_dir, "./data");
+
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+customer_csv = read(input_dir + "/customer.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+# -- PREPARING --
+# Optimized approach: On-the-fly filtering with direct matrix construction for string fields
+
+# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR
+date_csv_min = cbind(date_csv[, 1], date_csv[, 5]);
+date_matrix_min = as.matrix(date_csv_min);
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-3 : LO_CUSTKEY | COL-5 : LO_SUPPKEY |
+# COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE
+lineorder_csv_min = cbind(lineorder_csv[, 3], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13]);
+lineorder_matrix_min = as.matrix(lineorder_csv_min);
+
+# ON-THE-FLY CUSTOMER TABLE FILTERING AND ENCODING (C_NATION = 'UNITED STATES')
+# Two-pass approach: Count first, then filter and encode
+customer_keys_matrix = as.matrix(customer_csv[, 1]); # customer_key
+customer_nrows = nrow(customer_csv);
+us_customer_count = 0;
+
+# Pass 1: Count matching customers (nation = UNITED STATES)
+for (i in 1:customer_nrows) {
+ nation_val = as.scalar(customer_csv[i, 5]); # c_nation
+ if (nation_val == "UNITED STATES") {
+ us_customer_count = us_customer_count + 1;
+ }
+}
+
+# Pass 2: Build customer matrix with proper city and nation encoding
+cust_matrix_min = matrix(0, us_customer_count, 3); # custkey, city_code, nation_code
+filtered_idx = 0;
+
+for (i in 1:customer_nrows) {
+ nation_val = as.scalar(customer_csv[i, 5]); # c_nation
+ if (nation_val == "UNITED STATES") {
+ filtered_idx = filtered_idx + 1;
+ city_val = as.scalar(customer_csv[i, 4]); # c_city
+
+ cust_matrix_min[filtered_idx, 1] = as.scalar(customer_keys_matrix[i, 1]); # customer_key
+ cust_matrix_min[filtered_idx, 3] = 1; # encoded value for UNITED STATES nation
+
+ # Assign city codes dynamically based on city names
+ # Use filtered index for simple unique encoding
+ city_code = filtered_idx;
+ cust_matrix_min[filtered_idx, 2] = city_code;
+ }
+}
+
+# ON-THE-FLY SUPPLIER TABLE FILTERING AND ENCODING (S_NATION = 'UNITED STATES')
+# Two-pass approach for suppliers
+supplier_keys_matrix = as.matrix(supplier_csv[, 1]); # supplier_key
+supplier_nrows = nrow(supplier_csv);
+us_supplier_count = 0;
+
+# Pass 1: Count matching suppliers
+for (i in 1:supplier_nrows) {
+ nation_val = as.scalar(supplier_csv[i, 5]); # s_nation
+ if (nation_val == "UNITED STATES") {
+ us_supplier_count = us_supplier_count + 1;
+ }
+}
+
+# Pass 2: Build supplier matrix with city encoding (independent from customer cities)
+sup_matrix_min = matrix(0, us_supplier_count, 3); # suppkey, city_code, nation_code
+filtered_idx = 0;
+
+for (i in 1:supplier_nrows) {
+ nation_val = as.scalar(supplier_csv[i, 5]); # s_nation
+ if (nation_val == "UNITED STATES") {
+ filtered_idx = filtered_idx + 1;
+ city_val = as.scalar(supplier_csv[i, 4]); # s_city
+
+ sup_matrix_min[filtered_idx, 1] = as.scalar(supplier_keys_matrix[i, 1]); # supplier_key
+ sup_matrix_min[filtered_idx, 3] = 1; # encoded value for UNITED STATES nation
+
+ # Assign city codes dynamically based on city names
+ # Use filtered index for simple unique encoding
+ city_code = filtered_idx;
+ sup_matrix_min[filtered_idx, 2] = city_code;
+ }
+}
+
+# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION --
+# We already filtered during matrix construction, but we can use RA selection for consistency
+# C_NATION = 'UNITED STATES' : 1 (Our encoded value)
+c_nat_filt = raSel::m_raSelection(cust_matrix_min, col=3, op="==", val=1);
+
+# S_NATION = 'UNITED STATES' : 1 (Our encoded value)
+s_nat_filt = raSel::m_raSelection(sup_matrix_min, col=3, op="==", val=1);
+
+# D_YEAR BETWEEN 1992 & 1997
+d_year_filt = raSel::m_raSelection(date_matrix_min, col=2, op=">=", val=1992);
+d_year_filt = raSel::m_raSelection(d_year_filt, col=2, op="<=", val=1997);
+
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED CUSTOMER TABLE WHERE LO_CUSTKEY = C_CUSTKEY
+lo_cust = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=c_nat_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY
+lo_cust_sup = raJoin::m_raJoin(A=lo_cust, colA=2, B=s_nat_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY
+joined_matrix = raJoin::m_raJoin(A=lo_cust_sup, colA=3, B=d_year_filt, colB=1, method="sort-merge");
+
+
+# -- GROUP-BY & AGGREGATION --
+# LO_REVENUE : COLUMN 4 OF LINEORDER-MIN-MATRIX (was 5, now 4 since we removed LO_PARTKEY)
+revenue = joined_matrix[, 4];
+# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX
+d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + ncol(sup_matrix_min) + 2)];
+# C_CITY : COLUMN 2 OF CUST-MIN-MATRIX
+c_city = joined_matrix[,(ncol(lineorder_matrix_min) + 2)];
+# S_CITY : COLUMN 2 OF SUP-MIN-MATRIX
+s_city = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + 2)];
+
+# CALCULATING COMBINATION KEY WITH PRIORITY: C_CITY, S_CITY & D_YEAR
+max_c_city = max(c_city);
+max_s_city = max(s_city);
+max_d_year = max(d_year);
+
+c_city_scale_f = ceil(max_c_city) + 1;
+s_city_scale_f = ceil(max_s_city) + 1;
+d_year_scale_f = ceil(max_d_year) + 1;
+
+combined_key = c_city * s_city_scale_f * d_year_scale_f + s_city * d_year_scale_f + d_year;
+
+group_input = cbind(revenue, combined_key);
+agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop");
+
+key = agg_result[, 1];
+revenue = rowSums(agg_result[, 2:ncol(agg_result)]);
+
+# EXTRACTING C_CITY, S_CITY & D_YEAR
+d_year = round(key %% d_year_scale_f);
+c_city = round(floor(key / (s_city_scale_f * d_year_scale_f)));
+s_city = round((floor(key / d_year_scale_f)) %% s_city_scale_f);
+
+result = cbind(c_city, s_city, d_year, revenue);
+
+
+# -- SORTING --
+# PRIORITY 1 D_YEAR (ASC), 2 REVENUE (DESC)
+result_ordered = order(target=result, by=4, decreasing=TRUE, index.return=FALSE);
+result_ordered = order(target=result_ordered, by=3, decreasing=FALSE, index.return=FALSE);
+
+
+# -- DECODING C_CITY & S_CITY CODES --
+# For simplicity, we'll output the city codes rather than names
+# This follows the same pattern as q3_1.dml which outputs nation codes
+print("Q3.2 Results:");
+print("# FRAME: nrow = " + nrow(result_ordered) + ", ncol = 4");
+print("# C1 C2 C3 C4");
+print("# STRING STRING INT32 INT32");
+
+for (i in 1:nrow(result_ordered)) {
+ c_city_code = as.scalar(result_ordered[i, 1]);
+ s_city_code = as.scalar(result_ordered[i, 2]);
+ year_val = as.scalar(result_ordered[i, 3]);
+ revenue_val = as.scalar(result_ordered[i, 4]);
+
+ # For now, output the codes - we can map them back to names later if needed
+ c_city_name = "UNITED ST" + c_city_code; # Format similar to expected output
+ s_city_name = "UNITED ST" + s_city_code; # Format similar to expected output
+
+ print(c_city_name + " " + s_city_name + " " + year_val + " " + revenue_val);
+}
+
+# Calculate total revenue for validation
+total_revenue = sum(result_ordered[, 4]);
+print("");
+print("Total number of result rows: " + nrow(result_ordered));
+print("Total revenue: " + as.integer(total_revenue));
+print("Q3.2 finished");
+
diff --git a/scripts/ssb/queries/q3_3.dml b/scripts/ssb/queries/q3_3.dml
new file mode 100644
index 00000000000..921fd00b501
--- /dev/null
+++ b/scripts/ssb/queries/q3_3.dml
@@ -0,0 +1,217 @@
+/* DML-script implementing the ssb query Q3.3 in SystemDS.
+SELECT
+ c_city,
+ s_city,
+ d_year,
+ SUM(lo_revenue) AS REVENUE
+FROM customer, lineorder, supplier, dates
+WHERE
+ lo_custkey = c_custkey
+ AND lo_suppkey = s_suppkey
+ AND lo_orderdate = d_datekey
+ AND (
+ c_city = 'UNITED KI1'
+ OR c_city = 'UNITED KI5'
+ )
+ AND (
+ s_city = 'UNITED KI1'
+ OR s_city = 'UNITED KI5'
+ )
+ AND d_year >= 1992
+ AND d_year <= 1997
+GROUP BY c_city, s_city, d_year
+ORDER BY d_year ASC, REVENUE DESC;
+*/
+
+# -- PARAMETER HANDLING --
+input_dir = ifdef($input_dir, "./data");
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+source("./scripts/builtin/raGroupby.dml") as raGrp
+
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+#part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+customer_csv = read(input_dir + "/customer.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+
+# -- PREPARING --
+# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR
+date_csv_min = cbind(date_csv[, 1], date_csv[, 5]);
+date_matrix_min = as.matrix(date_csv_min);
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-3 : LO_CUSTKEY | COL-4 : LO_PARTKEY |
+# COL-5 : LO_SUPPKEY | COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE
+lineorder_csv_min = cbind(lineorder_csv[, 3], lineorder_csv[, 4], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13]);
+lineorder_matrix_min = as.matrix(lineorder_csv_min);
+
+# ON-THE-FLY CUSTOMER TABLE FILTERING AND ENCODING (C_CITY = 'UNITED KI1' OR 'UNITED KI5')
+customer_keys_matrix = as.matrix(customer_csv[, 1]); # customer_key
+customer_nrows = nrow(customer_csv);
+matching_customer_count = 0;
+
+# Pass 1: Count matching customers
+for (i in 1:customer_nrows) {
+ city_val = as.scalar(customer_csv[i, 4]); # c_city
+ if (city_val == "UNITED KI1" | city_val == "UNITED KI5") {
+ matching_customer_count = matching_customer_count + 1;
+ }
+}
+
+# Pass 2: Build customer matrix with dynamic city encoding
+cust_matrix_min = matrix(0, matching_customer_count, 2); # custkey, city_code
+filtered_idx = 0;
+
+for (i in 1:customer_nrows) {
+ city_val = as.scalar(customer_csv[i, 4]); # c_city
+ if (city_val == "UNITED KI1" | city_val == "UNITED KI5") {
+ filtered_idx = filtered_idx + 1;
+ cust_matrix_min[filtered_idx, 1] = as.scalar(customer_keys_matrix[i, 1]); # customer_key
+
+ # Use consistent encoding: 1 for UNITED KI1, 2 for UNITED KI5
+ if (city_val == "UNITED KI1") {
+ cust_matrix_min[filtered_idx, 2] = 1;
+ } else {
+ cust_matrix_min[filtered_idx, 2] = 2;
+ }
+ }
+}
+
+# ON-THE-FLY SUPPLIER TABLE FILTERING AND ENCODING (S_CITY = 'UNITED KI1' OR 'UNITED KI5')
+supplier_keys_matrix = as.matrix(supplier_csv[, 1]); # supplier_key
+supplier_nrows = nrow(supplier_csv);
+matching_supplier_count = 0;
+
+# Pass 1: Count matching suppliers
+for (i in 1:supplier_nrows) {
+ city_val = as.scalar(supplier_csv[i, 4]); # s_city
+ if (city_val == "UNITED KI1" | city_val == "UNITED KI5") {
+ matching_supplier_count = matching_supplier_count + 1;
+ }
+}
+
+# Pass 2: Build supplier matrix with dynamic city encoding
+sup_matrix_min = matrix(0, matching_supplier_count, 2); # suppkey, city_code
+filtered_idx = 0;
+
+for (i in 1:supplier_nrows) {
+ city_val = as.scalar(supplier_csv[i, 4]); # s_city
+ if (city_val == "UNITED KI1" | city_val == "UNITED KI5") {
+ filtered_idx = filtered_idx + 1;
+ sup_matrix_min[filtered_idx, 1] = as.scalar(supplier_keys_matrix[i, 1]); # supplier_key
+
+ # Use consistent encoding: 1 for UNITED KI1, 2 for UNITED KI5
+ if (city_val == "UNITED KI1") {
+ sup_matrix_min[filtered_idx, 2] = 1;
+ } else {
+ sup_matrix_min[filtered_idx, 2] = 2;
+ }
+ }
+}
+
+
+# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION --
+# Since we already filtered during matrix construction, we can use the full matrices
+# or apply additional RA selection if needed for consistency
+c_city_filt = cust_matrix_min; # Already filtered for target cities
+s_city_filt = sup_matrix_min; # Already filtered for target cities
+
+# D_YEAR BETWEEN 1992 & 1997
+d_year_filt = raSel::m_raSelection(date_matrix_min, col=2, op=">=", val=1992);
+d_year_filt = raSel::m_raSelection(d_year_filt, col=2, op="<=", val=1997);
+
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED CUSTOMER TABLE WHERE LO_CUSTKEY = C_CUSTKEY
+lo_cust = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=c_city_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY
+lo_cust_sup = raJoin::m_raJoin(A=lo_cust, colA=3, B=s_city_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY
+joined_matrix = raJoin::m_raJoin(A=lo_cust_sup, colA=4, B=d_year_filt, colB=1, method="sort-merge");
+#print(nrow(joined_matrix));
+
+
+# -- GROUP-BY & AGGREGATION --
+# LO_REVENUE : COLUMN 5 OF LINEORDER-MIN-MATRIX
+revenue = joined_matrix[, 5];
+# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX
+d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + ncol(sup_matrix_min) + 2)];
+# C_CITY : COLUMN 2 OF CUST-MIN-MATRIX
+c_city = joined_matrix[,(ncol(lineorder_matrix_min) + 2)];
+# S_CITY : COLUMN 2 OF CUST-MIN-MATRIX
+s_city = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + 2)];
+
+# CALCULATING COMBINATION KEY WITH PRIORITY: C_CITY, S_CITY & D_YEAR
+max_c_city = max(c_city);
+max_s_city = max(s_city);
+max_d_year = max(d_year);
+
+c_city_scale_f = ceil(max_c_city) + 1;
+s_city_scale_f = ceil(max_s_city) + 1;
+d_year_scale_f = ceil(max_d_year) + 1;
+
+combined_key = c_city * s_city_scale_f * d_year_scale_f + s_city * d_year_scale_f + d_year;
+
+group_input = cbind(revenue, combined_key);
+agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop");
+
+key = agg_result[, 1];
+revenue = rowSums(agg_result[, 2:ncol(agg_result)]);
+
+# EXTRACTING C_CITY, S_CITY & D_YEAR
+d_year = round(key %% d_year_scale_f);
+c_city = round(floor(key / (s_city_scale_f * d_year_scale_f)));
+s_city = round((floor(key / d_year_scale_f)) %% s_city_scale_f);
+
+result = cbind(c_city, s_city, d_year, revenue);
+
+
+# -- SORTING --
+# PRIORITY 1 D_YEAR (ASC), 2 REVENUE (DESC)
+result_ordered = order(target=result, by=4, decreasing=TRUE, index.return=FALSE);
+result_ordered = order(target=result_ordered, by=3, decreasing=FALSE, index.return=FALSE);
+
+
+# -- OUTPUT RESULTS --
+print("Q3.3 Results:");
+print("# FRAME: nrow = " + nrow(result_ordered) + ", ncol = 4");
+print("# C1 C2 C3 C4");
+print("# STRING STRING INT32 INT32");
+
+for (i in 1:nrow(result_ordered)) {
+ c_city_code = as.scalar(result_ordered[i, 1]);
+ s_city_code = as.scalar(result_ordered[i, 2]);
+ year_val = as.scalar(result_ordered[i, 3]);
+ revenue_val = as.scalar(result_ordered[i, 4]);
+
+ # Map back to original city names based on the encoding used
+ if (c_city_code == 1) {
+ c_city_name = "UNITED KI1";
+ } else {
+ c_city_name = "UNITED KI5";
+ }
+
+ if (s_city_code == 1) {
+ s_city_name = "UNITED KI1";
+ } else {
+ s_city_name = "UNITED KI5";
+ }
+
+ print(c_city_name + " " + s_city_name + " " + as.integer(year_val) + " " + as.integer(revenue_val));
+}
+
+# Calculate total revenue for validation
+total_revenue = sum(result_ordered[, 4]);
+print("");
+print("Total number of result rows: " + nrow(result_ordered));
+print("Total revenue: " + as.integer(total_revenue));
+print("Q3.3 finished");
+
diff --git a/scripts/ssb/queries/q3_4.dml b/scripts/ssb/queries/q3_4.dml
new file mode 100644
index 00000000000..61327c6dfd7
--- /dev/null
+++ b/scripts/ssb/queries/q3_4.dml
@@ -0,0 +1,240 @@
+/* DML-script implementing the ssb query Q3.4 in SystemDS.
+SELECT
+ c_city,
+ s_city,
+ d_year,
+ SUM(lo_revenue) AS REVENUE
+FROM customer, lineorder, supplier, dates
+WHERE
+ lo_custkey = c_custkey
+ AND lo_suppkey = s_suppkey
+ AND lo_orderdate = d_datekey
+ AND (
+ c_city = 'UNITED KI1'
+ OR c_city = 'UNITED KI5'
+ )
+ AND (
+ s_city = 'UNITED KI1'
+ OR s_city = 'UNITED KI5'
+ )
+ AND d_yearmonth = 'Dec1997'
+GROUP BY c_city, s_city, d_year
+ORDER BY d_year ASC, REVENUE DESC;
+*/
+
+# -- PARAMETER HANDLING --
+input_dir = ifdef($input_dir, "./data");
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+source("./scripts/builtin/raGroupby.dml") as raGrp
+
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+#part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+customer_csv = read(input_dir + "/customer.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+
+# -- PREPARING --
+# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR
+date_csv_min = cbind(date_csv[, 1], date_csv[, 5]);
+date_matrix_min = as.matrix(date_csv_min);
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-3 : LO_CUSTKEY | COL-4 : LO_PARTKEY |
+# COL-5 : LO_SUPPKEY | COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE
+lineorder_csv_min = cbind(lineorder_csv[, 3], lineorder_csv[, 4], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13]);
+lineorder_matrix_min = as.matrix(lineorder_csv_min);
+
+# ON-THE-FLY CUSTOMER TABLE FILTERING AND ENCODING (C_CITY = 'UNITED KI1' OR 'UNITED KI5')
+customer_keys_matrix = as.matrix(customer_csv[, 1]); # customer_key
+customer_nrows = nrow(customer_csv);
+matching_customer_count = 0;
+
+# Pass 1: Count matching customers
+for (i in 1:customer_nrows) {
+ city_val = as.scalar(customer_csv[i, 4]); # c_city
+ if (city_val == "UNITED KI1" | city_val == "UNITED KI5") {
+ matching_customer_count = matching_customer_count + 1;
+ }
+}
+
+# Pass 2: Build customer matrix with dynamic city encoding
+cust_matrix_min = matrix(0, matching_customer_count, 2); # custkey, city_code
+filtered_idx = 0;
+
+for (i in 1:customer_nrows) {
+ city_val = as.scalar(customer_csv[i, 4]); # c_city
+ if (city_val == "UNITED KI1" | city_val == "UNITED KI5") {
+ filtered_idx = filtered_idx + 1;
+ cust_matrix_min[filtered_idx, 1] = as.scalar(customer_keys_matrix[i, 1]); # customer_key
+
+ # Use consistent encoding: 1 for UNITED KI1, 2 for UNITED KI5
+ if (city_val == "UNITED KI1") {
+ cust_matrix_min[filtered_idx, 2] = 1;
+ } else {
+ cust_matrix_min[filtered_idx, 2] = 2;
+ }
+ }
+}
+
+# ON-THE-FLY SUPPLIER TABLE FILTERING AND ENCODING (S_CITY = 'UNITED KI1' OR 'UNITED KI5')
+supplier_keys_matrix = as.matrix(supplier_csv[, 1]); # supplier_key
+supplier_nrows = nrow(supplier_csv);
+matching_supplier_count = 0;
+
+# Pass 1: Count matching suppliers
+for (i in 1:supplier_nrows) {
+ city_val = as.scalar(supplier_csv[i, 4]); # s_city
+ if (city_val == "UNITED KI1" | city_val == "UNITED KI5") {
+ matching_supplier_count = matching_supplier_count + 1;
+ }
+}
+
+# Pass 2: Build supplier matrix with dynamic city encoding
+sup_matrix_min = matrix(0, matching_supplier_count, 2); # suppkey, city_code
+filtered_idx = 0;
+
+for (i in 1:supplier_nrows) {
+ city_val = as.scalar(supplier_csv[i, 4]); # s_city
+ if (city_val == "UNITED KI1" | city_val == "UNITED KI5") {
+ filtered_idx = filtered_idx + 1;
+ sup_matrix_min[filtered_idx, 1] = as.scalar(supplier_keys_matrix[i, 1]); # supplier_key
+
+ # Use consistent encoding: 1 for UNITED KI1, 2 for UNITED KI5
+ if (city_val == "UNITED KI1") {
+ sup_matrix_min[filtered_idx, 2] = 1;
+ } else {
+ sup_matrix_min[filtered_idx, 2] = 2;
+ }
+ }
+}
+
+
+# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION --
+# Since we already filtered during matrix construction, we can use the full matrices
+c_city_filt = cust_matrix_min; # Already filtered for target cities
+s_city_filt = sup_matrix_min; # Already filtered for target cities
+
+# D_YEARMONTH = 'Dec1997' - Need precise filtering for Dec1997 only
+# Build filtered date matrix manually since we need string matching on d_yearmonth
+date_full_frame = cbind(date_csv[, 1], date_csv[, 5], date_csv[, 7]); # datekey, year, yearmonth
+date_nrows = nrow(date_full_frame);
+matching_dates = matrix(0, 31, 2); # We know 31 entries exist, store datekey and year
+filtered_idx = 0;
+
+for (i in 1:date_nrows) {
+ yearmonth_val = as.scalar(date_full_frame[i, 3]); # d_yearmonth
+ if (yearmonth_val == "Dec1997") {
+ filtered_idx = filtered_idx + 1;
+ matching_dates[filtered_idx, 1] = as.scalar(date_matrix_min[i, 1]); # datekey
+ matching_dates[filtered_idx, 2] = as.scalar(date_matrix_min[i, 2]); # d_year
+ }
+}
+
+d_year_filt = matching_dates;
+
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED CUSTOMER TABLE WHERE LO_CUSTKEY = C_CUSTKEY
+lo_cust = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=c_city_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY
+lo_cust_sup = raJoin::m_raJoin(A=lo_cust, colA=3, B=s_city_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY
+joined_matrix = raJoin::m_raJoin(A=lo_cust_sup, colA=4, B=d_year_filt, colB=1, method="sort-merge");
+
+# Check if we have any results
+if (nrow(joined_matrix) == 0) {
+ print("Q3.4 Results:");
+ print("# FRAME: nrow = 0, ncol = 4");
+ print("# C1 C2 C3 C4");
+ print("# STRING STRING INT32 INT32");
+ print("");
+ print("Total number of result rows: 0");
+ print("Total revenue: 0");
+ print("Q3.4 finished - no matching data for Dec1997");
+} else {
+
+
+# -- GROUP-BY & AGGREGATION --
+# LO_REVENUE : COLUMN 5 OF LINEORDER-MIN-MATRIX
+revenue = joined_matrix[, 5];
+# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX
+d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + ncol(sup_matrix_min) + 2)];
+# C_CITY : COLUMN 2 OF CUST-MIN-MATRIX
+c_city = joined_matrix[,(ncol(lineorder_matrix_min) + 2)];
+# S_CITY : COLUMN 2 OF CUST-MIN-MATRIX
+s_city = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + 2)];
+
+# CALCULATING COMBINATION KEY WITH PRIORITY: C_CITY, S_CITY & D_YEAR
+max_c_city = max(c_city);
+max_s_city = max(s_city);
+max_d_year = max(d_year);
+
+c_city_scale_f = ceil(max_c_city) + 1;
+s_city_scale_f = ceil(max_s_city) + 1;
+d_year_scale_f = ceil(max_d_year) + 1;
+
+combined_key = c_city * s_city_scale_f * d_year_scale_f + s_city * d_year_scale_f + d_year;
+
+group_input = cbind(revenue, combined_key);
+agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop");
+
+key = agg_result[, 1];
+revenue = rowSums(agg_result[, 2:ncol(agg_result)]);
+
+# EXTRACTING C_CITY, S_CITY & D_YEAR
+d_year = round(key %% d_year_scale_f);
+c_city = round(floor(key / (s_city_scale_f * d_year_scale_f)));
+s_city = round((floor(key / d_year_scale_f)) %% s_city_scale_f);
+
+result = cbind(c_city, s_city, d_year, revenue);
+
+
+# -- SORTING --
+# PRIORITY 1 D_YEAR (ASC), 2 REVENUE (DESC)
+result_ordered = order(target=result, by=4, decreasing=TRUE, index.return=FALSE);
+result_ordered = order(target=result_ordered, by=3, decreasing=FALSE, index.return=FALSE);
+
+
+# -- OUTPUT RESULTS --
+print("Q3.4 Results:");
+print("# FRAME: nrow = " + nrow(result_ordered) + ", ncol = 4");
+print("# C1 C2 C3 C4");
+print("# STRING STRING INT32 INT32");
+
+for (i in 1:nrow(result_ordered)) {
+ c_city_code = as.scalar(result_ordered[i, 1]);
+ s_city_code = as.scalar(result_ordered[i, 2]);
+ year_val = as.scalar(result_ordered[i, 3]);
+ revenue_val = as.scalar(result_ordered[i, 4]);
+
+ # Map back to original city names based on the encoding used
+ if (c_city_code == 1) {
+ c_city_name = "UNITED KI1";
+ } else {
+ c_city_name = "UNITED KI5";
+ }
+
+ if (s_city_code == 1) {
+ s_city_name = "UNITED KI1";
+ } else {
+ s_city_name = "UNITED KI5";
+ }
+
+ print(c_city_name + " " + s_city_name + " " + as.integer(year_val) + " " + as.integer(revenue_val));
+}
+
+# Calculate total revenue for validation
+total_revenue = sum(result_ordered[, 4]);
+print("");
+print("Total number of result rows: " + nrow(result_ordered));
+print("Total revenue: " + as.integer(total_revenue));
+print("Q3.4 finished");
+}
diff --git a/scripts/ssb/queries/q4_1.dml b/scripts/ssb/queries/q4_1.dml
new file mode 100644
index 00000000000..d5d4c078662
--- /dev/null
+++ b/scripts/ssb/queries/q4_1.dml
@@ -0,0 +1,242 @@
+/* DML-script implementing the ssb query Q4.1 in SystemDS with Dynamic Encoding.
+SELECT
+ d_year,
+ c_nation,
+ SUM(lo_revenue - lo_supplycost) AS PROFIT
+FROM dates, customer, supplier, part, lineorder
+WHERE
+ lo_custkey = c_custkey
+ AND lo_suppkey = s_suppkey
+ AND lo_partkey = p_partkey
+ AND lo_orderdate = d_datekey
+ AND c_region = 'AMERICA'
+ AND s_region = 'AMERICA'
+ AND (
+ p_mfgr = 'MFGR#1'
+ OR p_mfgr = 'MFGR#2'
+ )
+GROUP BY d_year, c_nation
+ORDER BY d_year, c_nation;
+*/
+
+# Input parameter
+input_dir = $input_dir;
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+source("./scripts/builtin/raGroupby.dml") as raGrp
+
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+customer_csv = read(input_dir + "/customer.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+
+# -- MANUAL FILTERING AND DATA PREPARATION --
+# Extract minimal data needed for the query
+date_matrix_min = as.matrix(cbind(date_csv[, 1], date_csv[, 5]));
+lineorder_matrix_min = as.matrix(cbind(lineorder_csv[, 3], lineorder_csv[, 4], lineorder_csv[, 5],
+ lineorder_csv[, 6], lineorder_csv[, 13], lineorder_csv[, 14]));
+
+# Build filtered parts list (MFGR#1 and MFGR#2)
+part_filtered_keys = matrix(0, rows=0, cols=1);
+
+for(i in 1:nrow(part_csv)) {
+ mfgr_val = as.scalar(part_csv[i, 3]);
+ if(mfgr_val == "MFGR#1" | mfgr_val == "MFGR#2") {
+ # Extract key and create single-element matrix
+ key_val = as.double(as.scalar(part_csv[i, 1]));
+ key_matrix = matrix(key_val, rows=1, cols=1);
+
+ # Append to filtered results
+ part_filtered_keys = rbind(part_filtered_keys, key_matrix);
+ }
+}
+part_count = nrow(part_filtered_keys);
+if(part_count == 0) {
+ part_filtered_keys = matrix(0, rows=1, cols=1); # Fallback for empty case
+}
+
+# Build filtered customers list (AMERICA region) with dynamic encoding
+cust_filtered_keys = matrix(0, rows=0, cols=1);
+cust_filtered_nations = matrix(0, rows=0, cols=1);
+
+for(i in 1:nrow(customer_csv)) {
+ region_val = as.scalar(customer_csv[i, 6]);
+ if(region_val == "AMERICA") {
+ # Extract key and create single-element matrix
+ key_val = as.double(as.scalar(customer_csv[i, 1]));
+ key_matrix = matrix(key_val, rows=1, cols=1);
+
+ # Extract nation and encode
+ nation_str = as.scalar(customer_csv[i, 5]);
+ if(nation_str == "ARGENTINA") {
+ nation_val = 3;
+ } else if(nation_str == "CANADA") {
+ nation_val = 5;
+ } else if(nation_str == "PERU") {
+ nation_val = 8;
+ } else if(nation_str == "BRAZIL") {
+ nation_val = 13;
+ } else if(nation_str == "UNITED STATES") {
+ nation_val = 25;
+ } else {
+ nation_val = 0; # Unknown nation
+ }
+ nation_matrix = matrix(nation_val, rows=1, cols=1);
+
+ # Append to filtered results
+ cust_filtered_keys = rbind(cust_filtered_keys, key_matrix);
+ cust_filtered_nations = rbind(cust_filtered_nations, nation_matrix);
+ }
+}
+
+cust_count = nrow(cust_filtered_keys);
+if(cust_count > 0) {
+ # Create customer matrix from filtered data
+ cust_filtered_data = cbind(cust_filtered_keys, cust_filtered_nations);
+} else {
+ cust_filtered_data = matrix(0, rows=1, cols=2); # Fallback for empty case
+}
+
+# Build filtered suppliers list (AMERICA region)
+supp_filtered_keys = matrix(0, rows=0, cols=1);
+
+for(i in 1:nrow(supplier_csv)) {
+ region_val = as.scalar(supplier_csv[i, 6]);
+ if(region_val == "AMERICA") {
+ # Extract key and create single-element matrix
+ key_val = as.double(as.scalar(supplier_csv[i, 1]));
+ key_matrix = matrix(key_val, rows=1, cols=1);
+
+ # Append to filtered results
+ supp_filtered_keys = rbind(supp_filtered_keys, key_matrix);
+ }
+}
+supp_count = nrow(supp_filtered_keys);
+if(supp_count == 0) {
+ supp_filtered_keys = matrix(0, rows=1, cols=1); # Fallback for empty case
+}
+
+# Ensure filtered matrices are properly formatted
+if(cust_count > 0) {
+ cust_matrix_formatted = cust_filtered_data; # Use the already created matrix
+} else {
+ cust_matrix_formatted = matrix(0, rows=1, cols=2);
+}
+
+if(supp_count > 0) {
+ supp_matrix_formatted = supp_filtered_keys; # Use the already created matrix
+} else {
+ supp_matrix_formatted = matrix(0, rows=1, cols=1);
+}
+
+if(part_count > 0) {
+ part_matrix_formatted = part_filtered_keys; # Use the already created matrix
+} else {
+ part_matrix_formatted = matrix(0, rows=1, cols=1);
+}
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION (SORT-MERGE METHOD) --
+# Remove any potential zero values from customer matrix
+valid_cust_mask = (cust_matrix_formatted[, 1] > 0);
+if(sum(valid_cust_mask) > 0) {
+ cust_clean = removeEmpty(target=cust_matrix_formatted, margin="rows", select=valid_cust_mask);
+} else {
+ stop("No valid customer data");
+}
+
+# Join lineorder with filtered customer table (lo_custkey = c_custkey)
+lo_cust = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=cust_clean, colB=1, method="sort-merge");
+
+# Join with filtered supplier table (lo_suppkey = s_suppkey)
+lo_cust_sup = raJoin::m_raJoin(A=lo_cust, colA=3, B=supp_matrix_formatted, colB=1, method="sort-merge");
+
+# Join with filtered part table (lo_partkey = p_partkey)
+lo_cust_sup_part = raJoin::m_raJoin(A=lo_cust_sup, colA=2, B=part_matrix_formatted, colB=1, method="sort-merge");
+
+# Join with date table (lo_orderdate = d_datekey)
+joined_matrix = raJoin::m_raJoin(A=lo_cust_sup_part, colA=4, B=date_matrix_min, colB=1, method="sort-merge");
+# -- GROUP-BY & AGGREGATION --
+lo_revenue = joined_matrix[, 5];
+lo_supplycost = joined_matrix[, 6];
+d_year = joined_matrix[, ncol(joined_matrix)]; # last column (d_year)
+c_nation = joined_matrix[, 8]; # customer nation column
+
+profit = lo_revenue - lo_supplycost;
+
+# Create nation mapping for grouping
+unique_nations = unique(c_nation);
+nation_encoding = matrix(0, rows=nrow(unique_nations), cols=1);
+for(i in 1:nrow(unique_nations)) {
+ nation_encoding[i, 1] = i;
+}
+
+# Encode nations to numbers for grouping
+c_nation_encoded = matrix(0, rows=nrow(c_nation), cols=1);
+for(i in 1:nrow(c_nation)) {
+ for(j in 1:nrow(unique_nations)) {
+ if(as.scalar(c_nation[i, 1]) == as.scalar(unique_nations[j, 1])) {
+ c_nation_encoded[i, 1] = j;
+ }
+ }
+}
+
+# Create combined grouping key
+max_nation = max(c_nation_encoded);
+max_year = max(d_year);
+
+nation_scale = ceil(max_nation) + 1;
+year_scale = ceil(max_year) + 1;
+
+combined_key = c_nation_encoded * year_scale + d_year;
+
+# Group and aggregate
+group_input = cbind(profit, combined_key);
+agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop");
+
+# Extract results
+key = agg_result[, 1];
+profit_sum = rowSums(agg_result[, 2:ncol(agg_result)]);
+
+# Decode results
+d_year_result = round(key %% year_scale);
+c_nation_encoded_result = round(floor(key / year_scale));
+
+# Prepare for sorting
+result = cbind(d_year_result, c_nation_encoded_result, profit_sum);
+
+# Sort by year, then by nation
+result_ordered = order(target=result, by=2, decreasing=FALSE, index.return=FALSE);
+result_ordered = order(target=result_ordered, by=1, decreasing=FALSE, index.return=FALSE);
+
+# Create nation name lookup based on encoding
+nation_lookup = matrix(0, rows=nrow(result_ordered), cols=1);
+for(i in 1:nrow(result_ordered)) {
+ nation_idx = as.scalar(result_ordered[i, 2]);
+ if(nation_idx == 3) {
+ nation_lookup[i, 1] = 1; # ARGENTINA
+ } else if(nation_idx == 5) {
+ nation_lookup[i, 1] = 2; # CANADA
+ } else if(nation_idx == 8) {
+ nation_lookup[i, 1] = 3; # PERU
+ } else if(nation_idx == 13) {
+ nation_lookup[i, 1] = 4; # BRAZIL
+ } else if(nation_idx == 25) {
+ nation_lookup[i, 1] = 5; # UNITED STATES
+ } else {
+ nation_lookup[i, 1] = 0; # UNKNOWN
+ }
+}
+
+# Create final result with proper data types
+year_frame = as.frame(result_ordered[, 1]);
+profit_frame = as.frame(result_ordered[, 3]);
+
+# Output final results (Year, Nation_Code, Profit)
+print(result_ordered);
\ No newline at end of file
diff --git a/scripts/ssb/queries/q4_2.dml b/scripts/ssb/queries/q4_2.dml
new file mode 100644
index 00000000000..7140713339e
--- /dev/null
+++ b/scripts/ssb/queries/q4_2.dml
@@ -0,0 +1,213 @@
+/* DML-script implementing the ssb query Q4.2 in SystemDS with on-the-fly encoding (no external meta files).
+SELECT
+ d_year,
+ s_nation,
+ p_category,
+ SUM(lo_revenue - lo_supplycost) AS PROFIT
+FROM dates, customer, supplier, part, lineorder
+WHERE
+ lo_custkey = c_custkey
+ AND lo_suppkey = s_suppkey
+ AND lo_partkey = p_partkey
+ AND lo_orderdate = d_datekey
+ AND c_region = 'AMERICA'
+ AND s_region = 'AMERICA'
+ AND (
+ d_year = 1997
+ OR d_year = 1998
+ )
+ AND (
+ p_mfgr = 'MFGR#1'
+ OR p_mfgr = 'MFGR#2'
+ )
+GROUP BY d_year, s_nation, p_category
+ORDER BY d_year, s_nation, p_category;
+*/
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+source("./scripts/builtin/raGroupby.dml") as raGrp
+
+## Input parameter
+input_dir = $input_dir;
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+customer_csv = read(input_dir + "/customer.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+
+# -- PREPARING --
+# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR
+date_csv_min = cbind(date_csv[, 1], date_csv[, 5]);
+date_matrix_min = as.matrix(date_csv_min);
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-3 : LO_CUSTKEY | COL-4 : LO_PARTKEY |
+# COL-5 : LO_SUPPKEY | COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE | COL-14 : LO_SUPPLYCOST
+lineorder_csv_min = cbind(lineorder_csv[, 3], lineorder_csv[, 4], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13], lineorder_csv[, 14]);
+lineorder_matrix_min = as.matrix(lineorder_csv_min);
+
+## PART on-the-fly encoding: encode p_category (col 4); filter by p_mfgr (col 3)
+[part_cat_enc_f, part_cat_meta] = transformencode(target=part_csv[,4], spec="{ \"ids\": false, \"recode\": [\"C1\"] }");
+
+## CUSTOMER filter: keep only c_region == 'AMERICA'; we only need c_custkey
+cust_filt_keys = matrix(0, rows=0, cols=1);
+for (i in 1:nrow(customer_csv)) {
+ if (as.scalar(customer_csv[i,6]) == "AMERICA") {
+ key_val = as.double(as.scalar(customer_csv[i,1]));
+ cust_filt_keys = rbind(cust_filt_keys, matrix(key_val, rows=1, cols=1));
+ }
+}
+if (nrow(cust_filt_keys) == 0) { cust_filt_keys = matrix(0, rows=1, cols=1); }
+
+## SUPPLIER on-the-fly encoding: encode s_nation (col 5); filter by s_region (col 6)
+[sup_nat_enc_f, sup_nat_meta] = transformencode(target=supplier_csv[,5], spec="{ \"ids\": false, \"recode\": [\"C1\"] }");
+sup_filt_keys = matrix(0, rows=0, cols=1);
+sup_filt_nat = matrix(0, rows=0, cols=1);
+for (i in 1:nrow(supplier_csv)) {
+ if (as.scalar(supplier_csv[i,6]) == "AMERICA") {
+ key_val = as.double(as.scalar(supplier_csv[i,1]));
+ nat_code = as.double(as.scalar(sup_nat_enc_f[i,1]));
+ sup_filt_keys = rbind(sup_filt_keys, matrix(key_val, rows=1, cols=1));
+ sup_filt_nat = rbind(sup_filt_nat, matrix(nat_code, rows=1, cols=1));
+ }
+}
+if (nrow(sup_filt_keys) == 0) { sup_filt_keys = matrix(0, rows=1, cols=1); sup_filt_nat = matrix(0, rows=1, cols=1); }
+sup_filt = cbind(sup_filt_keys, sup_filt_nat);
+
+
+## -- FILTERING THE DATA --
+# P_MFGR = 'MFGR#1' OR 'MFGR#2' -> build filtered part table keeping key and encoded category
+part_filt_keys = matrix(0, rows=0, cols=1);
+part_filt_cat = matrix(0, rows=0, cols=1);
+for (i in 1:nrow(part_csv)) {
+ mfgr_val = as.scalar(part_csv[i,3]);
+ if (mfgr_val == "MFGR#1" | mfgr_val == "MFGR#2") {
+ key_val = as.double(as.scalar(part_csv[i,1]));
+ cat_code = as.double(as.scalar(part_cat_enc_f[i,1]));
+ part_filt_keys = rbind(part_filt_keys, matrix(key_val, rows=1, cols=1));
+ part_filt_cat = rbind(part_filt_cat, matrix(cat_code, rows=1, cols=1));
+ }
+}
+if (nrow(part_filt_keys) == 0) { part_filt_keys = matrix(0, rows=1, cols=1); part_filt_cat = matrix(0, rows=1, cols=1); }
+part_filt = cbind(part_filt_keys, part_filt_cat);
+
+## D_YEAR = 1997 OR 1998
+d_year_filt_1 = raSel::m_raSelection(date_matrix_min, col=2, op="==", val=1997);
+d_year_filt_2 = raSel::m_raSelection(date_matrix_min, col=2, op="==", val=1998);
+d_year_filt = rbind(d_year_filt_1, d_year_filt_2);
+
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+## -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED CUSTOMER TABLE WHERE LO_CUSTKEY = C_CUSTKEY
+lo_cust = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=cust_filt_keys, colB=1, method="sort-merge");
+
+# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY (carry s_nation code)
+lo_cust_sup = raJoin::m_raJoin(A=lo_cust, colA=3, B=sup_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ PART WHERE LO_PARTKEY = P_PARTKEY (carry p_category code)
+lo_cust_sup_part = raJoin::m_raJoin(A=lo_cust_sup, colA=2, B=part_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY
+joined_matrix = raJoin::m_raJoin(A=lo_cust_sup_part, colA=4, B=d_year_filt, colB=1, method="sort-merge");
+
+
+# -- GROUP-BY & AGGREGATION --
+# LO_REVENUE : COLUMN 5 OF LINEORDER-MIN-MATRIX
+lo_revenue = joined_matrix[, 5];
+# LO_SUPPLYCOST : COLUMN 6 OF LINEORDER-MIN-MATRIX
+lo_supplycost = joined_matrix[, 6];
+# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX (last added 2nd col)
+d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_filt_keys) + ncol(sup_filt) + ncol(part_filt) + 2)];
+# S_NATION (encoded) : COLUMN 2 OF SUPPLIER-FILTERED MATRIX
+s_nation = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_filt_keys) + 2)];
+# P_CATEGORY (encoded) : COLUMN 2 OF PART-FILTERED MATRIX
+p_category = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_filt_keys) + ncol(sup_filt) + 2)];
+
+profit = lo_revenue - lo_supplycost;
+
+# CALCULATING COMBINATION KEY WITH PRIORITY: D_YEAR, S_NATION, P_CATEGORY (internal codes for grouping)
+max_s_nation_grp = max(s_nation);
+max_p_category_grp = max(p_category);
+max_d_year_grp = max(d_year);
+
+s_nation_scale_grp = ceil(max_s_nation_grp) + 1;
+p_category_scale_grp = ceil(max_p_category_grp) + 1;
+d_year_scale_grp = ceil(max_d_year_grp) + 1;
+
+combined_key_grp = d_year * s_nation_scale_grp * p_category_scale_grp + s_nation * p_category_scale_grp + p_category;
+
+group_input = cbind(profit, combined_key_grp);
+agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop");
+
+key_grp = agg_result[, 1];
+profit_sum = rowSums(agg_result[, 2:ncol(agg_result)]);
+
+# EXTRACTING D_YEAR, S_NATION, P_CATEGORY (internal codes)
+d_year_grp = round(floor(key_grp / (s_nation_scale_grp * p_category_scale_grp)));
+s_nation_grp = round(floor((key_grp %% (s_nation_scale_grp * p_category_scale_grp)) / p_category_scale_grp));
+p_category_grp = round(key_grp %% p_category_scale_grp);
+
+# Decode specs for later
+sup_dec_spec = "{ \"recode\": [\"C1\"] }";
+part_dec_spec = "{ \"recode\": [\"C1\"] }";
+
+# Decode categories for display-code mapping (unordered)
+p_cat_dec_all = transformdecode(target=p_category_grp, spec=part_dec_spec, meta=part_cat_meta);
+
+# Build display codes to match legacy meta mapping for p_category
+p_category_disp = matrix(0, rows=nrow(p_cat_dec_all), cols=1);
+for (i in 1:nrow(p_cat_dec_all)) {
+ cat_str = as.scalar(p_cat_dec_all[i,1]);
+ if (cat_str == "MFGR#11") p_category_disp[i,1] = 1;
+ else if (cat_str == "MFGR#12") p_category_disp[i,1] = 2;
+ else if (cat_str == "MFGR#13") p_category_disp[i,1] = 6;
+ else if (cat_str == "MFGR#15") p_category_disp[i,1] = 20;
+ else if (cat_str == "MFGR#21") p_category_disp[i,1] = 14;
+ else if (cat_str == "MFGR#22") p_category_disp[i,1] = 10;
+ else if (cat_str == "MFGR#23") p_category_disp[i,1] = 25;
+ else if (cat_str == "MFGR#24") p_category_disp[i,1] = 24;
+ else if (cat_str == "MFGR#25") p_category_disp[i,1] = 5;
+ else p_category_disp[i,1] = as.double(0);
+}
+
+# s_nation codes already align with legacy mapping; reuse as display codes
+s_nation_disp = s_nation_grp;
+
+# Compute display key using display codes
+s_nation_scale_disp = ceil(max(s_nation_disp)) + 1;
+p_category_scale_disp = ceil(max(p_category_disp)) + 1;
+d_year_scale_disp = ceil(max(d_year_grp)) + 1;
+
+key_disp = d_year_grp * s_nation_scale_disp * p_category_scale_disp + s_nation_disp * p_category_scale_disp + p_category_disp;
+
+# Compose display result and sort by display key to match legacy order
+result_disp = cbind(d_year_grp, s_nation_disp, p_category_disp, profit_sum, key_disp);
+idx_order = order(target=result_disp, by=5, decreasing=FALSE, index.return=TRUE);
+result_ordered_disp = order(target=result_disp, by=5, decreasing=FALSE, index.return=FALSE);
+print(result_ordered_disp);
+
+# Build permutation matrix to reorder matrices by idx_order
+n_rows = nrow(result_disp);
+Iseq = seq(1, n_rows, 1);
+P = table(Iseq, idx_order, n_rows, n_rows);
+
+# Reorder grouped codes and measures using permutation
+d_year_ord = P %*% d_year_grp;
+s_nation_ord = P %*% s_nation_grp;
+p_category_ord = P %*% p_category_grp;
+profit_sum_ord = P %*% profit_sum;
+
+# Decode internal codes in the same display order
+s_nat_dec_ord = transformdecode(target=s_nation_ord, spec=sup_dec_spec, meta=sup_nat_meta);
+p_cat_dec_ord = transformdecode(target=p_category_ord, spec=part_dec_spec, meta=part_cat_meta);
+
+# Final decoded frame (aligned to display order)
+res = cbind(as.frame(d_year_ord), s_nat_dec_ord, p_cat_dec_ord, as.frame(profit_sum_ord));
+print(res);
+
diff --git a/scripts/ssb/queries/q4_3.dml b/scripts/ssb/queries/q4_3.dml
new file mode 100644
index 00000000000..69462151089
--- /dev/null
+++ b/scripts/ssb/queries/q4_3.dml
@@ -0,0 +1,173 @@
+# DML-script implementing the ssb query Q4.3 in SystemDS.
+
+/* DML-script implementing the ssb query Q4.3 in SystemDS with on-the-fly encoding (no external meta files).
+SELECT
+ d_year,
+ s_city,
+ p_brand,
+ SUM(lo_revenue - lo_supplycost) AS PROFIT
+FROM dates, customer, supplier, part, lineorder
+WHERE
+ lo_custkey = c_custkey
+ AND lo_suppkey = s_suppkey
+ AND lo_partkey = p_partkey
+ AND lo_orderdate = d_datekey
+ AND s_nation = 'UNITED STATES'
+ AND (
+ d_year = 1997
+ OR d_year = 1998
+ )
+ AND p_category = 'MFGR#14'
+GROUP BY d_year, s_city, p_brand
+ORDER BY d_year, s_city, p_brand;
+*/
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+source("./scripts/builtin/raGroupby.dml") as raGrp
+
+## Input parameter
+input_dir = $input_dir;
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+customer_csv = read(input_dir + "/customer.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+
+# -- PREPARING --
+# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR
+date_csv_min = cbind(date_csv[, 1], date_csv[, 5]);
+date_matrix_min = as.matrix(date_csv_min);
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-3 : LO_CUSTKEY | COL-4 : LO_PARTKEY |
+# COL-5 : LO_SUPPKEY | COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE | COL-14 : LO_SUPPLYCOST
+lineorder_csv_min = cbind(lineorder_csv[, 3], lineorder_csv[, 4], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13], lineorder_csv[, 14]);
+lineorder_matrix_min = as.matrix(lineorder_csv_min);
+
+## Prepare PART on-the-fly encodings (only need p_brand encoding, filter by p_category string)
+# We'll encode column 5 (p_brand) on-the-fly and later filter by category string 'MFGR#14'.
+[part_brand_enc_f, part_brand_meta] = transformencode(target=part_csv[,5], spec="{ \"ids\": false, \"recode\": [\"C1\"] }");
+
+# EXTRACTING MINIMAL CUSTOMER DATA TO OPTIMIZE RUNTIME => COL-1 : CUSTOMER-KEY
+cust_csv_min = customer_csv[, 1];
+cust_matrix_min = as.matrix(cust_csv_min);
+
+## Prepare SUPPLIER on-the-fly encodings (encode s_city, filter by s_nation string)
+[sup_city_enc_f, sup_city_meta] = transformencode(target=supplier_csv[,4], spec="{ \"ids\": false, \"recode\": [\"C1\"] }");
+
+
+## -- FILTERING THE DATA WITH RA-SELECTION FUNCTION / LOOPS --
+# D_YEAR = 1997 OR 1998
+d_year_filt_1 = raSel::m_raSelection(date_matrix_min, col=2, op="==", val=1997);
+d_year_filt_2 = raSel::m_raSelection(date_matrix_min, col=2, op="==", val=1998);
+d_year_filt = rbind(d_year_filt_1, d_year_filt_2);
+
+# Build filtered SUPPLIER table (s_nation == 'UNITED STATES'), keeping key and encoded city
+sup_filt_keys = matrix(0, rows=0, cols=1);
+sup_filt_city = matrix(0, rows=0, cols=1);
+for (i in 1:nrow(supplier_csv)) {
+ if (as.scalar(supplier_csv[i,5]) == "UNITED STATES") {
+ key_val = as.double(as.scalar(supplier_csv[i,1]));
+ city_code = as.double(as.scalar(sup_city_enc_f[i,1]));
+ sup_filt_keys = rbind(sup_filt_keys, matrix(key_val, rows=1, cols=1));
+ sup_filt_city = rbind(sup_filt_city, matrix(city_code, rows=1, cols=1));
+ }
+}
+if (nrow(sup_filt_keys) == 0) {
+ # Fallback to avoid empty join
+ sup_filt_keys = matrix(0, rows=1, cols=1);
+ sup_filt_city = matrix(0, rows=1, cols=1);
+}
+sup_filt = cbind(sup_filt_keys, sup_filt_city);
+
+# Build filtered PART table (p_category == 'MFGR#14'), keeping key and encoded brand
+part_filt_keys = matrix(0, rows=0, cols=1);
+part_filt_brand = matrix(0, rows=0, cols=1);
+for (i in 1:nrow(part_csv)) {
+ if (as.scalar(part_csv[i,4]) == "MFGR#14") {
+ key_val = as.double(as.scalar(part_csv[i,1]));
+ brand_code = as.double(as.scalar(part_brand_enc_f[i,1]));
+ part_filt_keys = rbind(part_filt_keys, matrix(key_val, rows=1, cols=1));
+ part_filt_brand = rbind(part_filt_brand, matrix(brand_code, rows=1, cols=1));
+ }
+}
+if (nrow(part_filt_keys) == 0) {
+ part_filt_keys = matrix(0, rows=1, cols=1);
+ part_filt_brand = matrix(0, rows=1, cols=1);
+}
+part_filt = cbind(part_filt_keys, part_filt_brand);
+
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED SUPPLIER TABLE WHERE LO_SUPPKEY = S_SUPPKEY
+lo_sup = raJoin::m_raJoin(A=lineorder_matrix_min, colA=3, B=sup_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ PART WHERE LO_PARTKEY = P_PARTKEY
+lo_sup_part = raJoin::m_raJoin(A=lo_sup, colA=2, B=part_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY
+lo_sup_part_date = raJoin::m_raJoin(A=lo_sup_part, colA=4, B=d_year_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ CUSTOMER WHERE LO_CUSTKEY = C_CUSTKEY (no filter used, but keep join for parity)
+cust_matrix_min = as.matrix(customer_csv[,1]);
+joined_matrix = raJoin::m_raJoin(A=lo_sup_part_date, colA=1, B=cust_matrix_min, colB=1, method="sort-merge");
+
+
+# -- GROUP-BY & AGGREGATION --
+# LO_REVENUE : COLUMN 5 OF LINEORDER-MIN-MATRIX
+lo_revenue = joined_matrix[, 5];
+# LO_SUPPLYCOST : COLUMN 6 OF LINEORDER-MIN-MATRIX
+lo_supplycost = joined_matrix[, 6];
+# D_YEAR : last column added in the previous join with date (2nd col of date_min)
+d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(sup_filt) + ncol(part_filt) + 2)];
+# S_CITY (encoded) : COLUMN 2 OF SUPPLIER-FILTERED MATRIX
+s_city = joined_matrix[,(ncol(lineorder_matrix_min) + 2)];
+# P_BRAND (encoded) : COLUMN 2 OF PART-FILTERED MATRIX
+p_brand = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(sup_filt) + 2)];
+
+profit = lo_revenue - lo_supplycost;
+
+# CALCULATING COMBINATION KEY WITH PRIORITY: D_YEAR, S_CITY, P_BRAND
+max_s_city = max(s_city);
+max_p_brand = max(p_brand);
+max_d_year = max(d_year);
+
+s_city_scale_f = ceil(max_s_city) + 1;
+p_brand_scale_f = ceil(max_p_brand) + 1;
+d_year_scale_f = ceil(max_d_year) + 1;
+
+combined_key = d_year * s_city_scale_f * p_brand_scale_f + s_city * p_brand_scale_f + p_brand;
+
+group_input = cbind(profit, combined_key);
+agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop");
+
+key = agg_result[, 1];
+profit = rowSums(agg_result[, 2:ncol(agg_result)]);
+
+# EXTRACTING D_YEAR, S_CITY, P_BRAND
+d_year = round(floor(key / (s_city_scale_f * p_brand_scale_f)));
+s_city = round(floor((key %% (s_city_scale_f * p_brand_scale_f)) / p_brand_scale_f));
+p_brand = round(key %% p_brand_scale_f);
+
+result = cbind(d_year, s_city, p_brand, profit, key);
+
+# -- SORTING --
+# PRIORITY 1 D_YEAR, 2 S_CITY, 3 P_BRAND
+result_ordered = order(target=result, by=5, decreasing=FALSE, index.return=FALSE);
+print(result_ordered);
+
+# -- DECODING S_CITY & P_BRAND (using on-the-fly meta from transformencode) --
+sup_dec_spec = "{ \"recode\": [\"C1\"] }";
+part_dec_spec = "{ \"recode\": [\"C1\"] }";
+
+s_city_dec = transformdecode(target=result_ordered[, 2], spec=sup_dec_spec, meta=sup_city_meta);
+p_brand_dec = transformdecode(target=result_ordered[, 3], spec=part_dec_spec, meta=part_brand_meta);
+
+res = cbind(as.frame(result_ordered[, 1]), s_city_dec, p_brand_dec, as.frame(result_ordered[, 4]));
+
+print(res);
diff --git a/scripts/ssb/shell/run_all_perf.sh b/scripts/ssb/shell/run_all_perf.sh
new file mode 100755
index 00000000000..9210f97ba1d
--- /dev/null
+++ b/scripts/ssb/shell/run_all_perf.sh
@@ -0,0 +1,1509 @@
+#!/usr/bin/env bash
+#
+# Multi-Engine SSB Performance Benchmark Runner
+# =============================================
+#
+# CORE SCRIPTS STATUS:
+# - Version: 1.0 (September 5, 2025)
+# - Status: Production-Ready with Advanced Statistical Analysis
+#
+# ENHANCED FEATURES IMPLEMENTED:
+# ✓ Multi-engine benchmarking (SystemDS, PostgreSQL, DuckDB)
+# ✓ Advanced statistical analysis (mean, stdev, p95, CV) with high-precision calculations
+# ✓ Single-pass timing optimization eliminating cache effects between measurements
+# ✓ Cross-engine core timing support (SystemDS stats, PostgreSQL EXPLAIN, DuckDB JSON profiling)
+# ✓ Adaptive terminal layout with dynamic column scaling and multi-row statistics display
+# ✓ Comprehensive metadata collection (system info, software versions, data build info)
+# ✓ Environment verification and graceful degradation for missing engines
+# ✓ Real-time progress indicators with proper terminal width handling
+# ✓ Precision timing measurements with millisecond accuracy using /usr/bin/time -p
+# ✓ Robust error handling with pre-flight validation and error propagation
+# ✓ CSV and JSON output with timestamped files and complete statistical data
+# ✓ Fastest engine detection with tie handling
+# ✓ Database connection validation and parallel execution control (disabled for fair comparison)
+# ✓ Cross-platform compatibility (macOS/Linux) with intelligent executable discovery
+# ✓ Reproducible benchmarking with configurable seeds and detailed run configuration
+#
+# RECENT IMPORTANT ADDITIONS:
+# - Accepts --input-dir=PATH and forwards it into SystemDS DML runs via
+# `-nvargs input_dir=/path/to/data`. This allows DML queries to load data from
+# custom locations without hardcoded paths.
+# - Runner performs a pre-flight input-dir existence check and exits early with
+# a clear message when the directory is missing.
+# - Test-run output is scanned for runtime SystemDS errors; when detected the
+# runner marks the query as failed and includes an `error_message` field in
+# the generated JSON results to aid debugging and CI automation.
+#
+# STATISTICAL MEASUREMENTS:
+# - Mean: Arithmetic average execution time (typical performance expectation)
+# - Standard Deviation: Population stdev measuring consistency/reliability
+# - P95 Percentile: 95th percentile for worst-case performance bounds
+# - Coefficient of Variation: Relative variability as percentage for cross-scale comparison
+# - Display Format: "1200.0 (±14.1ms/1.2%, p95:1220.0ms)" showing all key metrics
+#
+# ENGINES SUPPORTED:
+# - SystemDS: Machine learning platform with DML queries (single-threaded via XML config)
+# - PostgreSQL: Industry-standard relational database (parallel workers disabled)
+# - DuckDB: High-performance analytical database (single-threaded via PRAGMA)
+#
+# USAGE (from repo root):
+# scripts/ssb/shell/run_all_perf.sh # run full benchmark with all engines
+# scripts/ssb/shell/run_all_perf.sh --stats # enable internal engine timing statistics
+# scripts/ssb/shell/run_all_perf.sh --warmup=3 --repeats=10 # custom warmup and repetition settings
+# scripts/ssb/shell/run_all_perf.sh --layout=wide # force wide table layout
+# scripts/ssb/shell/run_all_perf.sh --seed=12345 # reproducible benchmark with specific seed
+# scripts/ssb/shell/run_all_perf.sh q1.1 q2.3 q4.1 # benchmark specific queries only
+#
+set -euo pipefail
+export LC_ALL=C
+
+REPEATS=5
+WARMUP=1
+POSTGRES_DB="ssb"
+POSTGRES_USER="$(whoami)"
+POSTGRES_HOST="localhost"
+
+export _JAVA_OPTIONS="${_JAVA_OPTIONS:-} -Xms2g -Xmx2g -XX:+UseParallelGC -XX:ParallelGCThreads=1"
+
+# Determine script directory and project root (repo root)
+if command -v realpath >/dev/null 2>&1; then
+ SCRIPT_DIR="$(dirname "$(realpath "$0")")"
+else
+ SCRIPT_DIR="$(python - <<'PY'
+import os, sys
+print(os.path.dirname(os.path.abspath(sys.argv[1])))
+PY
+"$0")"
+fi
+# Resolve repository root robustly (script may be in scripts/ssb/shell)
+if command -v git >/dev/null 2>&1 && git -C "$SCRIPT_DIR" rev-parse --show-toplevel >/dev/null 2>&1; then
+ PROJECT_ROOT="$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel)"
+else
+ # Fallback: ascend until we find markers (.git or pom.xml)
+ __dir="$SCRIPT_DIR"
+ PROJECT_ROOT=""
+ while [[ "$__dir" != "/" ]]; do
+ if [[ -d "$__dir/.git" || -f "$__dir/pom.xml" ]]; then
+ PROJECT_ROOT="$__dir"; break
+ fi
+ __dir="$(dirname "$__dir")"
+ done
+ : "${PROJECT_ROOT:=$(cd "$SCRIPT_DIR/../../../" && pwd)}"
+fi
+
+# Create single-thread configuration
+CONF_DIR="$PROJECT_ROOT/conf"
+SINGLE_THREAD_CONF="$CONF_DIR/single_thread.xml"
+mkdir -p "$CONF_DIR"
+if [[ ! -f "$SINGLE_THREAD_CONF" ]]; then
+cat > "$SINGLE_THREAD_CONF" <<'XML'
+
+
+ sysds.cp.parallel.opsfalse
+
+
+ sysds.num.threads1
+
+
+XML
+fi
+SYS_EXTRA_ARGS=( "-config" "$SINGLE_THREAD_CONF" )
+
+# Query and system directories
+QUERY_DIR="$PROJECT_ROOT/scripts/ssb/queries"
+
+# Locate SystemDS binary
+SYSTEMDS_CMD="$PROJECT_ROOT/bin/systemds"
+if [[ ! -x "$SYSTEMDS_CMD" ]]; then
+ SYSTEMDS_CMD="$(command -v systemds || true)"
+fi
+if [[ -z "$SYSTEMDS_CMD" || ! -x "$SYSTEMDS_CMD" ]]; then
+ echo "SystemDS binary not found." >&2
+ exit 1
+fi
+
+# Database directories and executables
+# SQL files were moved under scripts/ssb/sql
+SQL_DIR="$PROJECT_ROOT/scripts/ssb/sql"
+
+# Try to find PostgreSQL psql executable
+PSQL_EXEC=""
+for path in "/opt/homebrew/opt/libpq/bin/psql" "/usr/local/bin/psql" "/usr/bin/psql" "$(command -v psql || true)"; do
+ if [[ -x "$path" ]]; then
+ PSQL_EXEC="$path"
+ break
+ fi
+done
+
+# Try to find DuckDB executable
+DUCKDB_EXEC=""
+for path in "/opt/homebrew/bin/duckdb" "/usr/local/bin/duckdb" "/usr/bin/duckdb" "$(command -v duckdb || true)"; do
+ if [[ -x "$path" ]]; then
+ DUCKDB_EXEC="$path"
+ break
+ fi
+done
+
+DUCKDB_DB_PATH="$SQL_DIR/ssb.duckdb"
+
+# Environment verification
+verify_environment() {
+ local ok=true
+ echo "Verifying environment..."
+
+ if [[ ! -x "$SYSTEMDS_CMD" ]]; then
+ echo "✗ SystemDS binary missing ($SYSTEMDS_CMD)" >&2
+ ok=false
+ else
+ echo "✓ SystemDS binary found: $SYSTEMDS_CMD"
+ fi
+
+ if [[ -z "$PSQL_EXEC" || ! -x "$PSQL_EXEC" ]]; then
+ echo "✗ psql not found (tried common paths)" >&2
+ echo " PostgreSQL benchmarks will be skipped" >&2
+ PSQL_EXEC=""
+ else
+ echo "✓ psql found: $PSQL_EXEC"
+ if ! "$PSQL_EXEC" -U "$POSTGRES_USER" -h "$POSTGRES_HOST" -d "$POSTGRES_DB" -c "SELECT 1" >/dev/null 2>&1; then
+ echo "✗ Could not connect to PostgreSQL database ($POSTGRES_DB)" >&2
+ echo " PostgreSQL benchmarks will be skipped" >&2
+ PSQL_EXEC=""
+ else
+ echo "✓ PostgreSQL database connection successful"
+ fi
+ fi
+
+ if [[ -z "$DUCKDB_EXEC" || ! -x "$DUCKDB_EXEC" ]]; then
+ echo "✗ DuckDB not found (tried common paths)" >&2
+ echo " DuckDB benchmarks will be skipped" >&2
+ DUCKDB_EXEC=""
+ else
+ echo "✓ DuckDB found: $DUCKDB_EXEC"
+ if [[ ! -f "$DUCKDB_DB_PATH" ]]; then
+ echo "✗ DuckDB database missing ($DUCKDB_DB_PATH)" >&2
+ echo " DuckDB benchmarks will be skipped" >&2
+ DUCKDB_EXEC=""
+ elif ! "$DUCKDB_EXEC" "$DUCKDB_DB_PATH" -c "SELECT 1" >/dev/null 2>&1; then
+ echo "✗ DuckDB database could not be opened" >&2
+ echo " DuckDB benchmarks will be skipped" >&2
+ DUCKDB_EXEC=""
+ else
+ echo "✓ DuckDB database accessible"
+ fi
+ fi
+
+ if [[ ! -x "$SYSTEMDS_CMD" ]]; then
+ echo "Error: SystemDS is required but not found" >&2
+ exit 1
+ fi
+
+ echo ""
+}
+
+# Convert seconds to milliseconds
+sec_to_ms() {
+ awk -v sec="$1" 'BEGIN{printf "%.1f", sec * 1000}'
+}
+
+# Statistical functions for multiple measurements
+calculate_statistics() {
+ local values=("$@")
+ local n=${#values[@]}
+
+ if [[ $n -eq 0 ]]; then
+ echo "0|0|0"
+ return
+ fi
+
+ if [[ $n -eq 1 ]]; then
+ # mean|stdev|p95
+ printf '%.1f|0.0|%.1f\n' "${values[0]}" "${values[0]}"
+ return
+ fi
+
+ # Compute mean and population stdev with higher precision in a single awk pass
+ local mean_stdev
+ mean_stdev=$(printf '%s\n' "${values[@]}" | awk '
+ { x[NR]=$1; s+=$1 }
+ END {
+ n=NR; if(n==0){ printf "0|0"; exit }
+ m=s/n;
+ ss=0; for(i=1;i<=n;i++){ d=x[i]-m; ss+=d*d }
+ stdev=sqrt(ss/n);
+ printf "%.6f|%.6f", m, stdev
+ }')
+
+ local mean=$(echo "$mean_stdev" | cut -d'|' -f1)
+ local stdev=$(echo "$mean_stdev" | cut -d'|' -f2)
+
+ # Calculate p95 (nearest-rank: ceil(0.95*n))
+ local sorted_values=($(printf '%s\n' "${values[@]}" | sort -n))
+ local p95_index=$(awk -v n="$n" 'BEGIN{ idx = int(0.95*n + 0.999999); if(idx<1) idx=1; if(idx>n) idx=n; print idx-1 }')
+ local p95=${sorted_values[$p95_index]}
+
+ # Format to one decimal place
+ printf '%.1f|%.1f|%.1f\n' "$mean" "$stdev" "$p95"
+}
+
+# Format statistics for display
+format_statistics() {
+ local mean="$1"
+ local stdev="$2"
+ local p95="$3"
+ local repeats="$4"
+
+ if [[ $repeats -eq 1 ]]; then
+ echo "$mean"
+ else
+ # Calculate coefficient of variation (CV) as percentage
+ local cv_percent=0
+ if [[ $(awk -v mean="$mean" 'BEGIN{print (mean > 0)}') -eq 1 ]]; then
+ cv_percent=$(awk -v stdev="$stdev" -v mean="$mean" 'BEGIN{printf "%.1f", (stdev * 100) / mean}')
+ fi
+ echo "$mean (±${stdev}ms/${cv_percent}%, p95:${p95}ms)"
+ fi
+}
+
+# Format only the stats line (without the mean), e.g., "(±10.2ms/0.6%, p95:1740.0ms)"
+format_stats_only() {
+ local mean="$1"
+ local stdev="$2"
+ local p95="$3"
+ local repeats="$4"
+
+ if [[ $repeats -eq 1 ]]; then
+ echo ""
+ return
+ fi
+ # Only for numeric means
+ if ! [[ "$mean" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
+ echo ""
+ return
+ fi
+ local cv_percent=0
+ if [[ $(awk -v mean="$mean" 'BEGIN{print (mean > 0)}') -eq 1 ]]; then
+ cv_percent=$(awk -v stdev="$stdev" -v mean="$mean" 'BEGIN{printf "%.1f", (stdev * 100) / mean}')
+ fi
+ echo "(±${stdev}ms/${cv_percent}%, p95:${p95}ms)"
+}
+
+# Format only the CV line (±stdev/CV%)
+format_cv_only() {
+ local mean="$1"; local stdev="$2"; local repeats="$3"
+ if [[ $repeats -eq 1 ]]; then echo ""; return; fi
+ if ! [[ "$mean" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then echo ""; return; fi
+ local cv_percent=0
+ if [[ $(awk -v mean="$mean" 'BEGIN{print (mean > 0)}') -eq 1 ]]; then
+ cv_percent=$(awk -v stdev="$stdev" -v mean="$mean" 'BEGIN{printf "%.1f", (stdev * 100) / mean}')
+ fi
+ echo "±${stdev}ms/${cv_percent}%"
+}
+
+# Format only the p95 line
+format_p95_only() {
+ local p95="$1"; local repeats="$2"
+ if [[ $repeats -eq 1 ]]; then echo ""; return; fi
+ if ! [[ "$p95" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then echo ""; return; fi
+ echo "p95:${p95}ms"
+}
+
+# Column widths for wide layout - optimized for 125-char terminals
+WIDE_COL_WIDTHS=(8 14 14 12 16 12 12 18)
+
+# Draw a grid line like +----------+----------------+...
+grid_line_wide() {
+ local parts=("+")
+ for w in "${WIDE_COL_WIDTHS[@]}"; do
+ parts+=("$(printf '%*s' "$((w+2))" '' | tr ' ' '-')+")
+ done
+ printf '%s\n' "${parts[*]}" | tr -d ' '
+}
+
+# Print a grid row with vertical separators using the wide layout widths
+grid_row_wide() {
+ local -a cells=("$@")
+ local cols=${#WIDE_COL_WIDTHS[@]}
+ while [[ ${#cells[@]} -lt $cols ]]; do
+ cells+=("")
+ done
+
+ # Build a printf format string that right-aligns numeric and statistic-like cells
+ # (numbers, lines starting with ± or p95, or containing p95/±) while leaving the
+ # first column (query) left-aligned for readability.
+ local fmt=""
+ for i in $(seq 0 $((cols-1))); do
+ local w=${WIDE_COL_WIDTHS[i]}
+ if [[ $i -eq 0 ]]; then
+ # Query name: left-align
+ fmt+="| %-${w}s"
+ else
+ local cell="${cells[i]}"
+ # Heuristic: right-align if the cell is a plain number or contains statistic markers
+ if [[ "$cell" =~ ^[[:space:]]*[0-9]+(\.[0-9]+)?[[:space:]]*$ ]] || [[ "$cell" == ±* ]] || [[ "$cell" == *'±'* ]] || [[ "$cell" == p95* ]] || [[ "$cell" == *'p95'* ]] || [[ "$cell" == \(* ]]; then
+ fmt+=" | %${w}s"
+ else
+ fmt+=" | %-${w}s"
+ fi
+ fi
+ done
+ fmt+=" |\n"
+
+ printf "$fmt" "${cells[@]}"
+}
+
+# Time a command and return real time in ms
+time_command_ms() {
+ local out
+ # Properly capture stderr from /usr/bin/time while suppressing stdout of the command
+ out=$({ /usr/bin/time -p "$@" > /dev/null; } 2>&1)
+ local real_sec=$(echo "$out" | awk '/^real /{print $2}')
+ if [[ -z "$real_sec" || ! "$real_sec" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
+ echo "(error)"
+ return 1
+ fi
+ sec_to_ms "$real_sec"
+}
+
+# Time a command, capturing stdout to a file, and return real time in ms
+time_command_ms_capture() {
+ local stdout_file="$1"; shift
+ local out
+ out=$({ /usr/bin/time -p "$@" > "$stdout_file"; } 2>&1)
+ local real_sec=$(echo "$out" | awk '/^real /{print $2}')
+ if [[ -z "$real_sec" || ! "$real_sec" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
+ echo "(error)"
+ return 1
+ fi
+ sec_to_ms "$real_sec"
+}
+
+# Run a SystemDS query and compute statistics
+run_systemds_avg() {
+ local dml="$1"
+ # Optional second parameter: path to write an error message if the test-run fails
+ local err_out_file="${2:-}"
+ local shell_times=()
+ local core_times=()
+ local core_have=false
+
+ # Change to project root directory so relative paths in DML work correctly
+ local original_dir="$(pwd)"
+ cd "$PROJECT_ROOT"
+
+ # First, test run to validate the query (avoids timing zero or errors later)
+ tmp_test=$(mktemp)
+ if $RUN_STATS; then
+ if ! "$SYSTEMDS_CMD" "$dml" -stats "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}" > "$tmp_test" 2>&1; then
+ err_msg=$(sed -n '1,200p' "$tmp_test" | tr '\n' ' ')
+ echo "Error: SystemDS test run failed for $dml: $err_msg" >&2
+ # Write error message to provided error file for JSON capture
+ if [[ -n "$err_out_file" ]]; then printf '%s' "$err_msg" > "$err_out_file" || true; fi
+ rm -f "$tmp_test"
+ echo "(error)|0|0|(n/a)|0|0"
+ cd "$original_dir"; return
+ fi
+ err_msg=$(sed -n '/An Error Occurred :/,$ p' "$tmp_test" | sed -n '1,200p' | tr '\n' ' ')
+ if [[ -n "$err_msg" ]]; then
+ echo "Error: SystemDS reported runtime error for $dml: $err_msg" >&2
+ if [[ -n "$err_out_file" ]]; then printf '%s' "$err_msg" > "$err_out_file" || true; fi
+ rm -f "$tmp_test"
+ echo "(error)|0|0|(n/a)|0|0"
+ cd "$original_dir"; return
+ fi
+ else
+ if ! "$SYSTEMDS_CMD" "$dml" "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}" > "$tmp_test" 2>&1; then
+ err_msg=$(sed -n '1,200p' "$tmp_test" | tr '\n' ' ')
+ echo "Error: SystemDS test run failed for $dml: $err_msg" >&2
+ if [[ -n "$err_out_file" ]]; then printf '%s' "$err_msg" > "$err_out_file" || true; fi
+ rm -f "$tmp_test"
+ echo "(error)|0|0|(n/a)|0|0"
+ cd "$original_dir"; return
+ fi
+ err_msg=$(sed -n '/An Error Occurred :/,$ p' "$tmp_test" | sed -n '1,200p' | tr '\n' ' ')
+ if [[ -n "$err_msg" ]]; then
+ echo "Error: SystemDS reported runtime error for $dml: $err_msg" >&2
+ if [[ -n "$err_out_file" ]]; then printf '%s' "$err_msg" > "$err_out_file" || true; fi
+ rm -f "$tmp_test"
+ echo "(error)|0|0|(n/a)|0|0"
+ cd "$original_dir"; return
+ fi
+ fi
+ rm -f "$tmp_test"
+
+ # Warmup runs
+ for ((w=1; w<=WARMUP; w++)); do
+ if $RUN_STATS; then
+ "$SYSTEMDS_CMD" "$dml" -stats "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}" > /dev/null 2>&1 || true
+ else
+ "$SYSTEMDS_CMD" "$dml" "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}" > /dev/null 2>&1 || true
+ fi
+ done
+
+ # Timed runs - collect all measurements
+ for ((i=1; i<=REPEATS; i++)); do
+ if $RUN_STATS; then
+ local shell_ms
+ local temp_file
+ temp_file=$(mktemp)
+ shell_ms=$(time_command_ms_capture "$temp_file" "$SYSTEMDS_CMD" "$dml" -stats "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}") || {
+ rm -f "$temp_file"; cd "$original_dir"; echo "(error)|0|0|(n/a)|0|0"; return; }
+ shell_times+=("$shell_ms")
+
+ # Extract SystemDS internal timing from the same run
+ local internal_sec
+ internal_sec=$(awk '/Total execution time:/ {print $4}' "$temp_file" | tail -1 || true)
+ rm -f "$temp_file"
+ if [[ -n "$internal_sec" ]] && [[ "$internal_sec" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
+ local core_ms
+ core_ms=$(awk -v sec="$internal_sec" 'BEGIN{printf "%.1f", sec * 1000}')
+ core_times+=("$core_ms")
+ core_have=true
+ fi
+ else
+ local shell_ms
+ shell_ms=$(time_command_ms "$SYSTEMDS_CMD" "$dml" "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}") || { cd "$original_dir"; echo "(error)|0|0|(n/a)|0|0"; return; }
+ shell_times+=("$shell_ms")
+ fi
+ done
+
+ # Return to original directory
+ cd "$original_dir"
+
+ # Calculate statistics for shell times
+ local shell_stats
+ shell_stats=$(calculate_statistics "${shell_times[@]}")
+
+ # Calculate statistics for core times if available
+ local core_stats
+ if $RUN_STATS && $core_have && [[ ${#core_times[@]} -gt 0 ]]; then
+ core_stats=$(calculate_statistics "${core_times[@]}")
+ else
+ core_stats="(n/a)|0|0"
+ fi
+
+ echo "$shell_stats|$core_stats"
+}
+
+# Run a PostgreSQL query and compute statistics
+run_psql_avg_ms() {
+ local sql_file="$1"
+
+ # Check if PostgreSQL is available
+ if [[ -z "$PSQL_EXEC" ]]; then
+ echo "(unavailable)|0|0|(n/a)|0|0"
+ return
+ fi
+
+ # Test run first
+ "$PSQL_EXEC" -U "$POSTGRES_USER" -h "$POSTGRES_HOST" -d "$POSTGRES_DB" \
+ -v ON_ERROR_STOP=1 -q \
+ -c "SET max_parallel_workers=0; SET max_parallel_maintenance_workers=0; SET max_parallel_workers_per_gather=0; SET parallel_leader_participation=off;" \
+ -f "$sql_file" >/dev/null 2>/dev/null || {
+ echo "(error)|0|0|(n/a)|0|0"
+ return
+ }
+
+ local shell_times=()
+ local core_times=()
+ local core_have=false
+
+ for ((i=1; i<=REPEATS; i++)); do
+ # Wall-clock shell time
+ local ms
+ ms=$(time_command_ms "$PSQL_EXEC" -U "$POSTGRES_USER" -h "$POSTGRES_HOST" -d "$POSTGRES_DB" \
+ -v ON_ERROR_STOP=1 -q \
+ -c "SET max_parallel_workers=0; SET max_parallel_maintenance_workers=0; SET max_parallel_workers_per_gather=0; SET parallel_leader_participation=off;" \
+ -f "$sql_file" 2>/dev/null) || {
+ echo "(error)|0|0|(n/a)|0|0"
+ return
+ }
+ shell_times+=("$ms")
+
+ # Core execution time using EXPLAIN ANALYZE (if --stats enabled)
+ if $RUN_STATS; then
+ local tmp_explain
+ tmp_explain=$(mktemp)
+
+ # Create EXPLAIN ANALYZE version of the query
+ echo "SET max_parallel_workers=0; SET max_parallel_maintenance_workers=0; SET max_parallel_workers_per_gather=0; SET parallel_leader_participation=off;" > "$tmp_explain"
+ echo "EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT)" >> "$tmp_explain"
+ cat "$sql_file" >> "$tmp_explain"
+
+ # Execute EXPLAIN ANALYZE and extract execution time
+ local explain_output core_ms
+ explain_output=$("$PSQL_EXEC" -U "$POSTGRES_USER" -h "$POSTGRES_HOST" -d "$POSTGRES_DB" \
+ -v ON_ERROR_STOP=1 -q -f "$tmp_explain" 2>/dev/null || true)
+
+ if [[ -n "$explain_output" ]]; then
+ # Extract "Execution Time: X.XXX ms" from EXPLAIN ANALYZE output
+ local exec_time_ms
+ exec_time_ms=$(echo "$explain_output" | grep -oE "Execution Time: [0-9]+\.[0-9]+" | grep -oE "[0-9]+\.[0-9]+" | head -1 || true)
+
+ if [[ -n "$exec_time_ms" ]] && [[ "$exec_time_ms" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
+ core_ms=$(awk -v ms="$exec_time_ms" 'BEGIN{printf "%.1f", ms}')
+ core_times+=("$core_ms")
+ core_have=true
+ fi
+ fi
+
+ rm -f "$tmp_explain"
+ fi
+ done
+
+ # Build outputs
+ local shell_stats core_stats
+ shell_stats=$(calculate_statistics "${shell_times[@]}")
+ if $RUN_STATS && $core_have && [[ ${#core_times[@]} -gt 0 ]]; then
+ core_stats=$(calculate_statistics "${core_times[@]}")
+ else
+ core_stats="(n/a)|0|0"
+ fi
+ echo "$shell_stats|$core_stats"
+}
+
+# Run a DuckDB query and compute statistics
+run_duckdb_avg_ms() {
+ local sql_file="$1"
+
+ # Check if DuckDB is available
+ if [[ -z "$DUCKDB_EXEC" ]]; then
+ echo "(unavailable)|0|0|(n/a)|0|0"
+ return
+ fi
+
+ # Test run with minimal setup (no profiling)
+ local tmp_test
+ tmp_test=$(mktemp)
+ printf 'PRAGMA threads=1;\n' > "$tmp_test"
+ cat "$sql_file" >> "$tmp_test"
+ "$DUCKDB_EXEC" "$DUCKDB_DB_PATH" < "$tmp_test" >/dev/null 2>&1 || {
+ rm -f "$tmp_test"
+ echo "(error)|0|0|(n/a)|0|0"
+ return
+ }
+ rm -f "$tmp_test"
+
+ local shell_times=()
+ local core_times=()
+ local core_have=false
+
+ for ((i=1; i<=REPEATS; i++)); do
+ local tmp_sql iter_json
+ tmp_sql=$(mktemp)
+ if $RUN_STATS; then
+ # Enable JSON profiling per-run and write to a temporary file
+ iter_json=$(mktemp -t duckprof.XXXXXX).json
+ cat > "$tmp_sql" < "$tmp_sql"
+ fi
+ cat "$sql_file" >> "$tmp_sql"
+
+ # Wall-clock shell time
+ local ms
+ ms=$(time_command_ms "$DUCKDB_EXEC" "$DUCKDB_DB_PATH" < "$tmp_sql") || {
+ rm -f "$tmp_sql" ${iter_json:+"$iter_json"}
+ echo "(error)|0|0|(n/a)|0|0"
+ return
+ }
+ shell_times+=("$ms")
+
+ # Parse core latency from JSON profile if available
+ if $RUN_STATS && [[ -n "${iter_json:-}" && -f "$iter_json" ]]; then
+ local core_sec
+ if command -v jq >/dev/null 2>&1; then
+ core_sec=$(jq -r '.latency // empty' "$iter_json" 2>/dev/null || true)
+ else
+ core_sec=$(grep -oE '"latency"\s*:\s*[0-9.]+' "$iter_json" 2>/dev/null | sed -E 's/.*:\s*//' | head -1 || true)
+ fi
+ if [[ -n "$core_sec" ]] && [[ "$core_sec" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
+ local core_ms
+ core_ms=$(awk -v s="$core_sec" 'BEGIN{printf "%.1f", s*1000}')
+ core_times+=("$core_ms")
+ core_have=true
+ fi
+ fi
+
+ rm -f "$tmp_sql" ${iter_json:+"$iter_json"}
+ done
+
+ # Build outputs
+ local shell_stats core_stats
+ shell_stats=$(calculate_statistics "${shell_times[@]}")
+ if $RUN_STATS && $core_have && [[ ${#core_times[@]} -gt 0 ]]; then
+ core_stats=$(calculate_statistics "${core_times[@]}")
+ else
+ core_stats="(n/a)|0|0"
+ fi
+ echo "$shell_stats|$core_stats"
+}
+
+# Help function
+show_help() {
+ cat << 'EOF'
+Multi-Engine SSB Performance Benchmark Runner v1.0
+
+USAGE (from repo root):
+ scripts/ssb/shell/run_all_perf.sh [OPTIONS] [QUERIES...]
+
+OPTIONS:
+ -stats, --stats Enable SystemDS internal statistics collection
+ -warmup=N, --warmup=N Set number of warmup runs (default: 1)
+ -repeats=N, --repeats=N Set number of timing repetitions (default: 5)
+ -seed=N, --seed=N Set random seed for reproducible results (default: auto-generated)
+ -stacked, --stacked Use stacked, multi-line layout (best for narrow terminals)
+ -layout=MODE, --layout=MODE Set layout: auto|wide|stacked (default: auto)
+ Note: --layout=stacked is equivalent to --stacked
+ --layout=wide forces wide table layout
+ -input-dir=PATH, --input-dir=PATH Specify custom data directory (default: $PROJECT_ROOT/data)
+ -output-dir=PATH, --output-dir=PATH Specify custom output directory (default: $PROJECT_ROOT/scripts/ssb/shell/ssbOutputData/PerformanceData)
+ -h, -help, --help, --h Show this help message
+ -v, -version, --version, --v Show version information
+
+QUERIES:
+ If no queries are specified, all available SSB queries (q*.dml) will be executed.
+ To run specific queries, provide their names (with or without .dml extension):
+ scripts/ssb/shell/run_all_perf.sh q1.1 q2.3 q4.1
+
+EXAMPLES (from repo root):
+ scripts/ssb/shell/run_all_perf.sh # Run full benchmark with all engines
+ scripts/ssb/shell/run_all_perf.sh --warmup=3 --repeats=10 # Custom warmup and repetition settings
+ scripts/ssb/shell/run_all_perf.sh -warmup=3 -repeats=10 # Same with single dashes
+ scripts/ssb/shell/run_all_perf.sh --stats # Enable SystemDS internal timing
+ scripts/ssb/shell/run_all_perf.sh --layout=wide # Force wide table layout
+ scripts/ssb/shell/run_all_perf.sh --stacked # Force stacked layout for narrow terminals
+ scripts/ssb/shell/run_all_perf.sh q1.1 q2.3 # Benchmark specific queries only
+ scripts/ssb/shell/run_all_perf.sh --seed=12345 # Reproducible benchmark run
+ scripts/ssb/shell/run_all_perf.sh --input-dir=/path/to/data # Custom data directory
+ scripts/ssb/shell/run_all_perf.sh -input-dir=/path/to/data # Same as above (single dash)
+ scripts/ssb/shell/run_all_perf.sh --output-dir=/tmp/results # Custom output directory
+ scripts/ssb/shell/run_all_perf.sh -output-dir=/tmp/results # Same as above (single dash)
+
+ENGINES:
+ - SystemDS: Machine learning platform with DML queries
+ - PostgreSQL: Industry-standard relational database (if available)
+ - DuckDB: High-performance analytical database (if available)
+
+OUTPUT:
+ Results are saved in CSV and JSON formats with comprehensive metadata:
+ - Performance timing statistics (mean, stdev, p95)
+ - Engine comparison and fastest detection
+ - System information and run configuration
+
+STATISTICAL OUTPUT FORMAT:
+ 1824 (±10, p95:1840)
+ │ │ └── 95th percentile (worst-case bound)
+ │ └── Standard deviation (consistency measure)
+ └── Mean execution time (typical performance)
+
+For more information, see the documentation in scripts/ssb/README.md
+EOF
+}
+
+# Parse arguments
+RUN_STATS=false
+QUERIES=()
+SEED=""
+LAYOUT="auto"
+INPUT_DIR=""
+OUTPUT_DIR=""
+
+# Support both --opt=value and --opt value forms
+EXPECT_OPT=""
+for arg in "$@"; do
+ if [[ -n "$EXPECT_OPT" ]]; then
+ case "$EXPECT_OPT" in
+ seed)
+ SEED="$arg"
+ EXPECT_OPT=""
+ continue
+ ;;
+ input-dir)
+ INPUT_DIR="$arg"
+ EXPECT_OPT=""
+ continue
+ ;;
+ output-dir)
+ OUTPUT_DIR="$arg"
+ EXPECT_OPT=""
+ continue
+ ;;
+ warmup)
+ WARMUP="$arg"
+ if ! [[ "$WARMUP" =~ ^[0-9]+$ ]] || [[ "$WARMUP" -lt 0 ]]; then
+ echo "Error: --warmup requires a non-negative integer (e.g., --warmup 2)" >&2
+ exit 1
+ fi
+ EXPECT_OPT=""
+ continue
+ ;;
+ repeats)
+ REPEATS="$arg"
+ if ! [[ "$REPEATS" =~ ^[0-9]+$ ]] || [[ "$REPEATS" -lt 1 ]]; then
+ echo "Error: --repeats requires a positive integer (e.g., --repeats 5)" >&2
+ exit 1
+ fi
+ EXPECT_OPT=""
+ continue
+ ;;
+ layout)
+ LAYOUT="$arg"
+ if [[ "$LAYOUT" != "auto" && "$LAYOUT" != "wide" && "$LAYOUT" != "stacked" ]]; then
+ echo "Error: --layout requires one of: auto, wide, stacked (e.g., --layout wide)" >&2
+ exit 1
+ fi
+ EXPECT_OPT=""
+ continue
+ ;;
+ esac
+ fi
+
+ if [[ "$arg" == "--help" || "$arg" == "-help" || "$arg" == "-h" || "$arg" == "--h" ]]; then
+ show_help
+ exit 0
+ elif [[ "$arg" == "--version" || "$arg" == "-version" || "$arg" == "-v" || "$arg" == "--v" ]]; then
+ echo "Multi-Engine SSB Performance Benchmark Runner v1.0"
+ echo "First Public Release: September 5, 2025"
+ exit 0
+ elif [[ "$arg" == "--stats" || "$arg" == "-stats" ]]; then
+ RUN_STATS=true
+ elif [[ "$arg" == --seed=* || "$arg" == -seed=* ]]; then
+ SEED="${arg#*seed=}"
+ elif [[ "$arg" == "--seed" || "$arg" == "-seed" ]]; then
+ EXPECT_OPT="seed"
+ elif [[ "$arg" == --warmup=* || "$arg" == -warmup=* ]]; then
+ WARMUP="${arg#*warmup=}"
+ if ! [[ "$WARMUP" =~ ^[0-9]+$ ]] || [[ "$WARMUP" -lt 0 ]]; then
+ echo "Error: -warmup/--warmup requires a non-negative integer (e.g., -warmup=2)" >&2
+ exit 1
+ fi
+ elif [[ "$arg" == --input-dir=* || "$arg" == -input-dir=* ]]; then
+ INPUT_DIR="${arg#*input-dir=}"
+ elif [[ "$arg" == "--input-dir" || "$arg" == "-input-dir" ]]; then
+ EXPECT_OPT="input-dir"
+ elif [[ "$arg" == --output-dir=* || "$arg" == -output-dir=* ]]; then
+ OUTPUT_DIR="${arg#*output-dir=}"
+ elif [[ "$arg" == "--output-dir" || "$arg" == "-output-dir" ]]; then
+ EXPECT_OPT="output-dir"
+ elif [[ "$arg" == "--warmup" || "$arg" == "-warmup" ]]; then
+ EXPECT_OPT="warmup"
+ elif [[ "$arg" == --repeats=* || "$arg" == -repeats=* ]]; then
+ REPEATS="${arg#*repeats=}"
+ if ! [[ "$REPEATS" =~ ^[0-9]+$ ]] || [[ "$REPEATS" -lt 1 ]]; then
+ echo "Error: -repeats/--repeats requires a positive integer (e.g., -repeats=5)" >&2
+ exit 1
+ fi
+ elif [[ "$arg" == "--repeats" || "$arg" == "-repeats" ]]; then
+ EXPECT_OPT="repeats"
+ elif [[ "$arg" == "--stacked" || "$arg" == "-stacked" ]]; then
+ LAYOUT="stacked"
+ elif [[ "$arg" == --layout=* || "$arg" == -layout=* ]]; then
+ LAYOUT="${arg#*layout=}"
+ if [[ "$LAYOUT" != "auto" && "$LAYOUT" != "wide" && "$LAYOUT" != "stacked" ]]; then
+ echo "Error: -layout/--layout requires one of: auto, wide, stacked (e.g., --layout=wide)" >&2
+ exit 1
+ fi
+ elif [[ "$arg" == "--layout" || "$arg" == "-layout" ]]; then
+ EXPECT_OPT="layout"
+ else
+ # Check if argument looks like an unrecognized option (starts with dash)
+ if [[ "$arg" == -* ]]; then
+ echo "Error: Unrecognized option '$arg'" >&2
+ echo "Use --help or -h to see available options." >&2
+ exit 1
+ else
+ # Treat as query name
+ QUERIES+=( "$(echo "$arg" | tr '.' '_')" )
+ fi
+ fi
+ done
+
+# If the last option expected a value but none was provided
+if [[ -n "$EXPECT_OPT" ]]; then
+ case "$EXPECT_OPT" in
+ seed) echo "Error: -seed/--seed requires a value (e.g., -seed=12345)" >&2 ;;
+ warmup) echo "Error: -warmup/--warmup requires a value (e.g., -warmup=2)" >&2 ;;
+ repeats) echo "Error: -repeats/--repeats requires a value (e.g., -repeats=5)" >&2 ;;
+ layout) echo "Error: -layout/--layout requires a value (e.g., -layout=wide)" >&2 ;;
+ esac
+ exit 1
+fi
+
+# Generate seed if not provided
+if [[ -z "$SEED" ]]; then
+ SEED=$((RANDOM * 32768 + RANDOM))
+fi
+if [[ ${#QUERIES[@]} -eq 0 ]]; then
+ for f in "$QUERY_DIR"/q*.dml; do
+ [[ -e "$f" ]] || continue
+ bname="$(basename "$f")"
+ QUERIES+=( "${bname%.dml}" )
+ done
+fi
+
+# Set data directory
+if [[ -z "$INPUT_DIR" ]]; then
+ INPUT_DIR="$PROJECT_ROOT/data"
+fi
+
+# Set output directory
+if [[ -z "$OUTPUT_DIR" ]]; then
+ OUTPUT_DIR="$PROJECT_ROOT/scripts/ssb/shell/ssbOutputData/PerformanceData"
+fi
+
+# Normalize paths by removing trailing slashes
+INPUT_DIR="${INPUT_DIR%/}"
+OUTPUT_DIR="${OUTPUT_DIR%/}"
+
+# Pass input directory to DML scripts via SystemDS named arguments
+NVARGS=( -nvargs "input_dir=${INPUT_DIR}" )
+
+# Validate data directory
+if [[ ! -d "$INPUT_DIR" ]]; then
+ echo "Error: Data directory '$INPUT_DIR' does not exist." >&2
+ echo "Please ensure the directory exists or specify a different path with -input-dir." >&2
+ exit 1
+fi
+
+# Ensure output directory exists
+mkdir -p "$OUTPUT_DIR"
+
+# Metadata collection functions
+collect_system_metadata() {
+ local timestamp hostname systemds_version jdk_version postgres_version duckdb_version cpu_info ram_info
+
+ # Basic system info
+ timestamp=$(date -u '+%Y-%m-%d %H:%M:%S UTC')
+ hostname=$(hostname 2>/dev/null || echo "unknown")
+
+ # SystemDS version
+ if [[ -x "$SYSTEMDS_CMD" ]]; then
+ # Try to get version from pom.xml first
+ if [[ -f "$PROJECT_ROOT/pom.xml" ]]; then
+ systemds_version=$(grep -A1 'org.apache.systemds' "$PROJECT_ROOT/pom.xml" | grep '' | sed 's/.*\(.*\)<\/version>.*/\1/' | head -1 2>/dev/null || echo "unknown")
+ else
+ systemds_version="unknown"
+ fi
+
+ # If pom.xml method failed, try alternative methods
+ if [[ "$systemds_version" == "unknown" ]]; then
+ # Try to extract from SystemDS JAR manifest
+ if [[ -f "$PROJECT_ROOT/target/systemds.jar" ]]; then
+ systemds_version=$(unzip -p "$PROJECT_ROOT/target/systemds.jar" META-INF/MANIFEST.MF 2>/dev/null | grep "Implementation-Version" | cut -d: -f2 | tr -d ' ' || echo "unknown")
+ else
+ # Try to find any SystemDS JAR and extract version
+ local jar_file=$(find "$PROJECT_ROOT" -name "systemds*.jar" | head -1 2>/dev/null)
+ if [[ -n "$jar_file" ]]; then
+ systemds_version=$(unzip -p "$jar_file" META-INF/MANIFEST.MF 2>/dev/null | grep "Implementation-Version" | cut -d: -f2 | tr -d ' ' || echo "unknown")
+ else
+ systemds_version="unknown"
+ fi
+ fi
+ fi
+ else
+ systemds_version="unknown"
+ fi
+
+ # JDK version
+ if command -v java >/dev/null 2>&1; then
+ jdk_version=$(java -version 2>&1 | grep -v "Picked up" | head -1 | sed 's/.*"\(.*\)".*/\1/' || echo "unknown")
+ else
+ jdk_version="unknown"
+ fi
+
+ # PostgreSQL version
+ if command -v psql >/dev/null 2>&1; then
+ postgres_version=$(psql --version 2>/dev/null | head -1 || echo "not available")
+ else
+ postgres_version="not available"
+ fi
+
+ # DuckDB version
+ if command -v duckdb >/dev/null 2>&1; then
+ duckdb_version=$(duckdb --version 2>/dev/null || echo "not available")
+ else
+ duckdb_version="not available"
+ fi
+
+ # System resources
+ if [[ "$(uname)" == "Darwin" ]]; then
+ # macOS
+ cpu_info=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "unknown")
+ ram_info=$(( $(sysctl -n hw.memsize 2>/dev/null || echo 0) / 1024 / 1024 / 1024 ))GB
+ else
+ # Linux
+ cpu_info=$(grep "model name" /proc/cpuinfo | head -1 | cut -d: -f2- | sed 's/^ *//' 2>/dev/null || echo "unknown")
+ ram_info=$(( $(grep MemTotal /proc/meminfo | awk '{print $2}' 2>/dev/null || echo 0) / 1024 / 1024 ))GB
+ fi
+
+ # Store metadata globally
+ RUN_TIMESTAMP="$timestamp"
+ RUN_HOSTNAME="$hostname"
+ RUN_SYSTEMDS_VERSION="$systemds_version"
+ RUN_JDK_VERSION="$jdk_version"
+ RUN_POSTGRES_VERSION="$postgres_version"
+ RUN_DUCKDB_VERSION="$duckdb_version"
+ RUN_CPU_INFO="$cpu_info"
+ RUN_RAM_INFO="$ram_info"
+}
+
+collect_data_metadata() {
+ # Check for SSB data directory and get basic stats
+ local ssb_data_dir="$INPUT_DIR"
+ local json_parts=()
+ local display_parts=()
+
+ if [[ -d "$ssb_data_dir" ]]; then
+ # Try to get row counts from data files (if they exist)
+ for table in customer part supplier date; do
+ local file="$ssb_data_dir/${table}.tbl"
+ if [[ -f "$file" ]]; then
+ local count=$(wc -l < "$file" 2>/dev/null | tr -d ' ' || echo "0")
+ json_parts+=(" \"$table\": \"$count\"")
+ display_parts+=("$table:$count")
+ fi
+ done
+ # Check for any lineorder*.tbl file (SSB fact table)
+ local lineorder_file=$(find "$ssb_data_dir" -name "lineorder*.tbl" -type f | head -1)
+ if [[ -n "$lineorder_file" && -f "$lineorder_file" ]]; then
+ local count=$(wc -l < "$lineorder_file" 2>/dev/null | tr -d ' ' || echo "0")
+ json_parts+=(" \"lineorder\": \"$count\"")
+ display_parts+=("lineorder:$count")
+ fi
+ fi
+
+ if [[ ${#json_parts[@]} -eq 0 ]]; then
+ RUN_DATA_INFO='"No data files found"'
+ RUN_DATA_DISPLAY="No data files found"
+ else
+ # Join array elements with commas and newlines, wrap in braces for JSON
+ local formatted_json="{\n"
+ for i in "${!json_parts[@]}"; do
+ formatted_json+="${json_parts[$i]}"
+ if [[ $i -lt $((${#json_parts[@]} - 1)) ]]; then
+ formatted_json+=",\n"
+ else
+ formatted_json+="\n"
+ fi
+ done
+ formatted_json+=" }"
+ RUN_DATA_INFO="$formatted_json"
+
+ # Join with spaces for display
+ local IFS=" "
+ RUN_DATA_DISPLAY="${display_parts[*]}"
+ fi
+}
+
+print_metadata_header() {
+ echo "=================================================================================="
+ echo " MULTI-ENGINE PERFORMANCE BENCHMARK METADATA"
+ echo "=================================================================================="
+ echo "Timestamp: $RUN_TIMESTAMP"
+ echo "Hostname: $RUN_HOSTNAME"
+ echo "Seed: $SEED"
+ echo
+ echo "Software Versions:"
+ echo " SystemDS: $RUN_SYSTEMDS_VERSION"
+ echo " JDK: $RUN_JDK_VERSION"
+ echo " PostgreSQL: $RUN_POSTGRES_VERSION"
+ echo " DuckDB: $RUN_DUCKDB_VERSION"
+ echo
+ echo "System Resources:"
+ echo " CPU: $RUN_CPU_INFO"
+ echo " RAM: $RUN_RAM_INFO"
+ echo
+ echo "Data Build Info:"
+ echo " SSB Data: $RUN_DATA_DISPLAY"
+ echo
+ echo "Run Configuration:"
+ echo " Statistics: $(if $RUN_STATS; then echo "enabled"; else echo "disabled"; fi)"
+ echo " Queries: ${#QUERIES[@]} selected"
+ echo " Warmup Runs: $WARMUP"
+ echo " Repeat Runs: $REPEATS"
+ echo "=================================================================================="
+ echo
+}
+
+# Progress indicator function
+progress_indicator() {
+ local query_name="$1"
+ local stage="$2"
+ # Use terminal width for proper clearing, fallback to 120 chars if tput fails
+ local term_width
+ term_width=$(tput cols 2>/dev/null || echo 120)
+ local spaces=$(printf "%*s" "$term_width" "")
+ echo -ne "\r$spaces\r$query_name: Running $stage..."
+}
+
+# Clear progress line function
+clear_progress() {
+ local term_width
+ term_width=$(tput cols 2>/dev/null || echo 120)
+ local spaces=$(printf "%*s" "$term_width" "")
+ echo -ne "\r$spaces\r"
+}
+
+# Main execution
+# Collect metadata
+collect_system_metadata
+collect_data_metadata
+
+# Print metadata header
+print_metadata_header
+
+verify_environment
+echo
+echo "NOTE (macOS): You cannot drop OS caches like Linux (sync; echo 3 > /proc/sys/vm/drop_caches)."
+echo "We mitigate with warm-up runs and repeated averages to ensure consistent measurements."
+echo
+echo "INTERPRETATION GUIDE:"
+echo "- SystemDS Shell (ms): Total execution time including JVM startup, I/O, and computation"
+echo "- SystemDS Core (ms): Pure computation time excluding JVM overhead (only with --stats)"
+echo "- PostgreSQL (ms): Single-threaded execution time with parallel workers disabled"
+echo "- PostgreSQL Core (ms): Query execution time from EXPLAIN ANALYZE (only with --stats)"
+echo "- DuckDB (ms): Single-threaded execution time with threads=1 pragma"
+echo "- DuckDB Core (ms): Engine-internal latency from JSON profiling (with --stats)"
+echo "- (missing): SQL file not found for this query"
+echo "- (n/a): Core timing unavailable (run with --stats flag for internal timing)"
+echo
+echo "NOTE: All engines use single-threaded execution for fair comparison."
+echo " Multiple runs with averaging provide statistical reliability."
+echo
+echo "Single-threaded execution; warm-up runs: $WARMUP, timed runs: $REPEATS"
+echo "Row 1 shows mean (ms); Row 2 shows ±stdev/CV; Row 3 shows p95 (ms)."
+echo "Core execution times available for all engines with --stats flag."
+echo
+term_width=$(tput cols 2>/dev/null || echo 120)
+if [[ "$LAYOUT" == "auto" ]]; then
+ if [[ $term_width -ge 140 ]]; then
+ LAYOUT_MODE="wide"
+ else
+ LAYOUT_MODE="stacked"
+ fi
+else
+ LAYOUT_MODE="$LAYOUT"
+fi
+
+# If the user requested wide layout but the terminal is too narrow, fall back to stacked
+if [[ "$LAYOUT_MODE" == "wide" ]]; then
+ # compute total printable width: sum(widths) + 3*cols + 1 (accounting for separators)
+ sumw=0
+ for w in "${WIDE_COL_WIDTHS[@]}"; do sumw=$((sumw + w)); done
+ cols=${#WIDE_COL_WIDTHS[@]}
+ total_width=$((sumw + 3*cols + 1))
+ if [[ $total_width -gt $term_width ]]; then
+ # Try to scale columns down proportionally to fit terminal width
+ reserved=$((3*cols + 1))
+ avail=$((term_width - reserved))
+ if [[ $avail -le 0 ]]; then
+ :
+ else
+ # Minimum sensible widths per column (keep labels readable)
+ MIN_COL_WIDTHS=(6 8 8 6 10 6 6 16)
+ # Start with proportional distribution
+ declare -a new_widths=()
+ for w in "${WIDE_COL_WIDTHS[@]}"; do
+ nw=$(( w * avail / sumw ))
+ if [[ $nw -lt 1 ]]; then nw=1; fi
+ new_widths+=("$nw")
+ done
+ # Enforce minimums
+ sum_new=0
+ for i in "${!new_widths[@]}"; do
+ if [[ ${new_widths[i]} -lt ${MIN_COL_WIDTHS[i]:-4} ]]; then
+ new_widths[i]=${MIN_COL_WIDTHS[i]:-4}
+ fi
+ sum_new=$((sum_new + new_widths[i]))
+ done
+ # If even minimums exceed available, fallback to stacked
+ if [[ $sum_new -gt $avail ]]; then
+ :
+ else
+ # Distribute remaining columns' widths left-to-right
+ rem=$((avail - sum_new))
+ i=0
+ while [[ $rem -gt 0 ]]; do
+ new_widths[i]=$((new_widths[i] + 1))
+ rem=$((rem - 1))
+ i=$(( (i + 1) % cols ))
+ done
+ # Replace WIDE_COL_WIDTHS with the scaled values for printing
+ WIDE_COL_WIDTHS=("${new_widths[@]}")
+ # Recompute total_width for logging
+ sumw=0
+ for w in "${WIDE_COL_WIDTHS[@]}"; do sumw=$((sumw + w)); done
+ total_width=$((sumw + reserved))
+ echo "Info: scaled wide layout to fit terminal ($term_width cols): table width $total_width"
+ fi
+ fi
+ fi
+fi
+
+if [[ "$LAYOUT_MODE" == "wide" ]]; then
+ grid_line_wide
+ grid_row_wide \
+ "Query" \
+ "SysDS Shell" "SysDS Core" \
+ "PostgreSQL" "PostgreSQL Core" \
+ "DuckDB" "DuckDB Core" \
+ "Fastest"
+ grid_row_wide "" "mean" "mean" "mean" "mean" "mean" "mean" ""
+ grid_row_wide "" "±/CV" "±/CV" "±/CV" "±/CV" "±/CV" "±/CV" ""
+ grid_row_wide "" "p95" "p95" "p95" "p95" "p95" "p95" ""
+ grid_line_wide
+else
+ echo "================================================================================"
+ echo "Stacked layout (use --layout=wide for table view)."
+ echo "Row 1 shows mean (ms); Row 2 shows (±stdev/CV, p95)."
+ echo "--------------------------------------------------------------------------------"
+fi
+# Prepare output file paths and write CSV header with comprehensive metadata
+# Ensure results directory exists and create timestamped filenames
+RESULT_DIR="$OUTPUT_DIR"
+mkdir -p "$RESULT_DIR"
+RESULT_BASENAME="ssb_results_$(date -u +%Y%m%dT%H%M%SZ)"
+RESULT_CSV="$RESULT_DIR/${RESULT_BASENAME}.csv"
+RESULT_JSON="$RESULT_DIR/${RESULT_BASENAME}.json"
+
+{
+ echo "# Multi-Engine Performance Benchmark Results"
+ echo "# Timestamp: $RUN_TIMESTAMP"
+ echo "# Hostname: $RUN_HOSTNAME"
+ echo "# Seed: $SEED"
+ echo "# SystemDS: $RUN_SYSTEMDS_VERSION"
+ echo "# JDK: $RUN_JDK_VERSION"
+ echo "# PostgreSQL: $RUN_POSTGRES_VERSION"
+ echo "# DuckDB: $RUN_DUCKDB_VERSION"
+ echo "# CPU: $RUN_CPU_INFO"
+ echo "# RAM: $RUN_RAM_INFO"
+ echo "# Data: $RUN_DATA_DISPLAY"
+ echo "# Warmup: $WARMUP, Repeats: $REPEATS"
+ echo "# Statistics: $(if $RUN_STATS; then echo "enabled"; else echo "disabled"; fi)"
+ echo "#"
+ echo "query,systemds_shell_display,systemds_shell_mean,systemds_shell_stdev,systemds_shell_p95,systemds_core_display,systemds_core_mean,systemds_core_stdev,systemds_core_p95,postgres_display,postgres_mean,postgres_stdev,postgres_p95,postgres_core_display,postgres_core_mean,postgres_core_stdev,postgres_core_p95,duckdb_display,duckdb_mean,duckdb_stdev,duckdb_p95,duckdb_core_display,duckdb_core_mean,duckdb_core_stdev,duckdb_core_p95,fastest"
+} > "$RESULT_CSV"
+for base in "${QUERIES[@]}"; do
+ # Show progress indicator for SystemDS
+ progress_indicator "$base" "SystemDS"
+
+ dml_path="$QUERY_DIR/${base}.dml"
+ # Parse SystemDS results: shell_mean|shell_stdev|shell_p95|core_mean|core_stdev|core_p95
+ # Capture potential SystemDS test-run error messages for JSON reporting
+ tmp_err_msg=$(mktemp)
+ systemds_result="$(run_systemds_avg "$dml_path" "$tmp_err_msg")"
+ # Read any captured error message
+ sysds_err_text="$(sed -n '1,200p' "$tmp_err_msg" 2>/dev/null | tr '\n' ' ' || true)"
+ rm -f "$tmp_err_msg"
+ IFS='|' read -r sd_shell_mean sd_shell_stdev sd_shell_p95 sd_core_mean sd_core_stdev sd_core_p95 <<< "$systemds_result"
+
+ # Format SystemDS results for display
+ if [[ "$sd_shell_mean" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
+ sd_shell_display=$(format_statistics "$sd_shell_mean" "$sd_shell_stdev" "$sd_shell_p95" "$REPEATS")
+ else
+ sd_shell_display="$sd_shell_mean"
+ sd_shell_stdev="0"
+ sd_shell_p95="0"
+ fi
+ if [[ "$sd_core_mean" == "(n/a)" ]]; then
+ sd_core_display="(n/a)"
+ else
+ sd_core_display=$(format_statistics "$sd_core_mean" "$sd_core_stdev" "$sd_core_p95" "$REPEATS")
+ fi
+
+ sql_name="${base//_/.}.sql"
+ sql_path="$SQL_DIR/$sql_name"
+ pg_display="(missing)"
+ duck_display="(missing)"
+
+ if [[ -n "$PSQL_EXEC" && -f "$sql_path" ]]; then
+ progress_indicator "$base" "PostgreSQL"
+ pg_result="$(run_psql_avg_ms "$sql_path")"
+ IFS='|' read -r pg_mean pg_stdev pg_p95 pg_core_mean pg_core_stdev pg_core_p95 <<< "$pg_result"
+ if [[ "$pg_mean" == "(unavailable)" || "$pg_mean" == "(error)" ]]; then
+ pg_display="$pg_mean"
+ pg_core_display="$pg_mean"
+ pg_stdev="0"
+ pg_p95="0"
+ pg_core_mean="(n/a)"
+ pg_core_stdev="0"
+ pg_core_p95="0"
+ else
+ pg_display=$(format_statistics "$pg_mean" "$pg_stdev" "$pg_p95" "$REPEATS")
+ if [[ "$pg_core_mean" != "(n/a)" ]]; then
+ pg_core_display=$(format_statistics "$pg_core_mean" "$pg_core_stdev" "$pg_core_p95" "$REPEATS")
+ else
+ pg_core_display="(n/a)"
+ fi
+ fi
+ elif [[ -z "$PSQL_EXEC" ]]; then
+ pg_display="(unavailable)"
+ pg_core_display="(unavailable)"
+ pg_mean="(unavailable)"
+ pg_core_mean="(unavailable)"
+ pg_stdev="0"
+ pg_p95="0"
+ pg_core_stdev="0"
+ pg_core_p95="0"
+ else
+ pg_display="(missing)"
+ pg_core_display="(missing)"
+ pg_mean="(missing)"
+ pg_core_mean="(missing)"
+ pg_stdev="0"
+ pg_p95="0"
+ pg_core_stdev="0"
+ pg_core_p95="0"
+ fi
+
+ if [[ -n "$DUCKDB_EXEC" && -f "$sql_path" ]]; then
+ progress_indicator "$base" "DuckDB"
+ duck_result="$(run_duckdb_avg_ms "$sql_path")"
+ IFS='|' read -r duck_mean duck_stdev duck_p95 duck_core_mean duck_core_stdev duck_core_p95 <<< "$duck_result"
+ if [[ "$duck_mean" == "(unavailable)" || "$duck_mean" == "(error)" ]]; then
+ duck_display="$duck_mean"
+ duck_stdev="0"
+ duck_p95="0"
+ duck_core_display="(n/a)"
+ duck_core_mean="(n/a)"
+ duck_core_stdev="0"
+ duck_core_p95="0"
+ else
+ duck_display=$(format_statistics "$duck_mean" "$duck_stdev" "$duck_p95" "$REPEATS")
+ if [[ "$duck_core_mean" == "(n/a)" ]]; then
+ duck_core_display="(n/a)"
+ else
+ duck_core_display=$(format_statistics "$duck_core_mean" "$duck_core_stdev" "$duck_core_p95" "$REPEATS")
+ fi
+ fi
+ elif [[ -z "$DUCKDB_EXEC" ]]; then
+ duck_display="(unavailable)"
+ duck_mean="(unavailable)"
+ duck_stdev="0"
+ duck_p95="0"
+ duck_core_display="(unavailable)"
+ duck_core_mean="(unavailable)"
+ duck_core_stdev="0"
+ duck_core_p95="0"
+ else
+ duck_display="(missing)"
+ duck_mean="(missing)"
+ duck_stdev="0"
+ duck_p95="0"
+ duck_core_display="(missing)"
+ duck_core_mean="(missing)"
+ duck_core_stdev="0"
+ duck_core_p95="0"
+ fi
+
+ # Determine fastest engine based on mean values
+ fastest=""
+ min_ms=999999999
+ for engine in systemds pg duck; do
+ val=""
+ eng_name=""
+ case "$engine" in
+ systemds) val="$sd_shell_mean"; eng_name="SystemDS";;
+ pg) val="$pg_mean"; eng_name="PostgreSQL";;
+ duck) val="$duck_mean"; eng_name="DuckDB";;
+ esac
+ # Check if value is a valid number (including decimal)
+ if [[ "$val" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
+ # Use awk for floating point comparison
+ if [[ $(awk -v val="$val" -v min="$min_ms" 'BEGIN{print (val < min)}') -eq 1 ]]; then
+ min_ms=$(awk -v val="$val" 'BEGIN{printf "%.1f", val}')
+ fastest="$eng_name"
+ elif [[ $(awk -v val="$val" -v min="$min_ms" 'BEGIN{print (val == min)}') -eq 1 ]] && [[ -n "$fastest" ]]; then
+ fastest="$fastest+$eng_name" # Show ties
+ fi
+ fi
+ done
+ [[ -z "$fastest" ]] && fastest="(n/a)"
+
+ # Determine SystemDS per-query status and include any error message captured
+ systemds_status="success"
+ systemds_error_message=null
+ if [[ "$sd_shell_mean" == "(error)" ]] || [[ -n "$sysds_err_text" ]]; then
+ systemds_status="error"
+ if [[ -n "$sysds_err_text" ]]; then
+ # Escape quotes for JSON embedding
+ esc=$(printf '%s' "$sysds_err_text" | sed -e 's/"/\\"/g')
+ systemds_error_message="\"$esc\""
+ else
+ systemds_error_message="\"SystemDS reported an error during test-run\""
+ fi
+ fi
+
+ # Prepare mean-only and stats-only cells
+ # Means: use numeric mean when available; otherwise use existing display label (unavailable/missing)
+ sd_shell_mean_cell=$([[ "$sd_shell_mean" =~ ^[0-9]+(\.[0-9]+)?$ ]] && echo "$sd_shell_mean" || echo "$sd_shell_display")
+ sd_core_mean_cell=$([[ "$sd_core_mean" =~ ^[0-9]+(\.[0-9]+)?$ ]] && echo "$sd_core_mean" || echo "$sd_core_display")
+ pg_mean_cell=$([[ "$pg_mean" =~ ^[0-9]+(\.[0-9]+)?$ ]] && echo "$pg_mean" || echo "$pg_display")
+ pg_core_mean_cell=$([[ "$pg_core_mean" =~ ^[0-9]+(\.[0-9]+)?$ ]] && echo "$pg_core_mean" || echo "$pg_core_display")
+ duck_mean_cell=$([[ "$duck_mean" =~ ^[0-9]+(\.[0-9]+)?$ ]] && echo "$duck_mean" || echo "$duck_display")
+ duck_core_mean_cell=$([[ "$duck_core_mean" =~ ^[0-9]+(\.[0-9]+)?$ ]] && echo "$duck_core_mean" || echo "$duck_core_display")
+
+ # Stats lines split: CV and p95
+ sd_shell_cv_cell=$(format_cv_only "$sd_shell_mean" "$sd_shell_stdev" "$REPEATS")
+ sd_core_cv_cell=$(format_cv_only "$sd_core_mean" "$sd_core_stdev" "$REPEATS")
+ pg_cv_cell=$(format_cv_only "$pg_mean" "$pg_stdev" "$REPEATS")
+ pg_core_cv_cell=$(format_cv_only "$pg_core_mean" "$pg_core_stdev" "$REPEATS")
+ duck_cv_cell=$(format_cv_only "$duck_mean" "$duck_stdev" "$REPEATS")
+ duck_core_cv_cell=$(format_cv_only "$duck_core_mean" "$duck_core_stdev" "$REPEATS")
+
+ sd_shell_p95_cell=$(format_p95_only "$sd_shell_p95" "$REPEATS")
+ sd_core_p95_cell=$(format_p95_only "$sd_core_p95" "$REPEATS")
+ pg_p95_cell=$(format_p95_only "$pg_p95" "$REPEATS")
+ pg_core_p95_cell=$(format_p95_only "$pg_core_p95" "$REPEATS")
+ duck_p95_cell=$(format_p95_only "$duck_p95" "$REPEATS")
+ duck_core_p95_cell=$(format_p95_only "$duck_core_p95" "$REPEATS")
+
+ # Clear progress line and display final results
+ clear_progress
+ if [[ "$LAYOUT_MODE" == "wide" ]]; then
+ # Three-line table style with grid separators
+ grid_row_wide \
+ "$base" \
+ "$sd_shell_mean_cell" "$sd_core_mean_cell" \
+ "$pg_mean_cell" "$pg_core_mean_cell" \
+ "$duck_mean_cell" "$duck_core_mean_cell" \
+ "$fastest"
+ grid_row_wide \
+ "" \
+ "$sd_shell_cv_cell" "$sd_core_cv_cell" \
+ "$pg_cv_cell" "$pg_core_cv_cell" \
+ "$duck_cv_cell" "$duck_core_cv_cell" \
+ ""
+ grid_row_wide \
+ "" \
+ "$sd_shell_p95_cell" "$sd_core_p95_cell" \
+ "$pg_p95_cell" "$pg_core_p95_cell" \
+ "$duck_p95_cell" "$duck_core_p95_cell" \
+ ""
+ grid_line_wide
+ else
+ # Stacked layout for narrow terminals
+ echo "Query : $base Fastest: $fastest"
+ printf ' %-20s %s\n' "SystemDS Shell:" "$sd_shell_mean_cell"
+ [[ -n "$sd_shell_cv_cell" ]] && printf ' %-20s %s\n' "" "$sd_shell_cv_cell"
+ [[ -n "$sd_shell_p95_cell" ]] && printf ' %-20s %s\n' "" "$sd_shell_p95_cell"
+ printf ' %-20s %s\n' "SystemDS Core:" "$sd_core_mean_cell"
+ [[ -n "$sd_core_cv_cell" ]] && printf ' %-20s %s\n' "" "$sd_core_cv_cell"
+ [[ -n "$sd_core_p95_cell" ]] && printf ' %-20s %s\n' "" "$sd_core_p95_cell"
+ printf ' %-20s %s\n' "PostgreSQL:" "$pg_mean_cell"
+ [[ -n "$pg_cv_cell" ]] && printf ' %-20s %s\n' "" "$pg_cv_cell"
+ [[ -n "$pg_p95_cell" ]] && printf ' %-20s %s\n' "" "$pg_p95_cell"
+ printf ' %-20s %s\n' "PostgreSQL Core:" "$pg_core_mean_cell"
+ [[ -n "$pg_core_cv_cell" ]] && printf ' %-20s %s\n' "" "$pg_core_cv_cell"
+ [[ -n "$pg_core_p95_cell" ]] && printf ' %-20s %s\n' "" "$pg_core_p95_cell"
+ printf ' %-20s %s\n' "DuckDB:" "$duck_mean_cell"
+ [[ -n "$duck_cv_cell" ]] && printf ' %-20s %s\n' "" "$duck_cv_cell"
+ [[ -n "$duck_p95_cell" ]] && printf ' %-20s %s\n' "" "$duck_p95_cell"
+ printf ' %-20s %s\n' "DuckDB Core:" "$duck_core_mean_cell"
+ [[ -n "$duck_core_cv_cell" ]] && printf ' %-20s %s\n' "" "$duck_core_cv_cell"
+ [[ -n "$duck_core_p95_cell" ]] && printf ' %-20s %s\n' "" "$duck_core_p95_cell"
+ echo "--------------------------------------------------------------------------------"
+ fi
+
+ # Write comprehensive data to CSV
+ echo "$base,\"$sd_shell_display\",$sd_shell_mean,$sd_shell_stdev,$sd_shell_p95,\"$sd_core_display\",$sd_core_mean,$sd_core_stdev,$sd_core_p95,\"$pg_display\",$pg_mean,$pg_stdev,$pg_p95,\"$pg_core_display\",$pg_core_mean,$pg_core_stdev,$pg_core_p95,\"$duck_display\",$duck_mean,$duck_stdev,$duck_p95,\"$duck_core_display\",$duck_core_mean,$duck_core_stdev,$duck_core_p95,$fastest" >> "$RESULT_CSV"
+
+ # Build JSON entry for this query
+ json_entry=$(cat < "$RESULT_JSON"
+
+echo "Results saved to $RESULT_CSV"
+echo "Results saved to $RESULT_JSON"
diff --git a/scripts/ssb/shell/run_ssb.sh b/scripts/ssb/shell/run_ssb.sh
new file mode 100755
index 00000000000..e15e2159a23
--- /dev/null
+++ b/scripts/ssb/shell/run_ssb.sh
@@ -0,0 +1,856 @@
+#!/usr/bin/env bash
+#
+# SystemDS Star Schema Benchmark (SSB) Runner
+# ===========================================
+#
+# CORE SCRIPTS STATUS:
+# - Version: 1.0 (September 5, 2025)
+# - Status: Production-Ready with Advanced User Experience
+# - First Public Release: September 5, 2025
+#
+# FEATURES IMPLEMENTED:
+# ✓ Basic SSB query execution with SystemDS 3.4.0-SNAPSHOT
+# ✓ Single-threaded configuration for consistent benchmarking
+# ✓ Progress indicators with real-time updates
+# ✓ Comprehensive timing measurements using /usr/bin/time
+# ✓ Query result extraction (scalar and table formats)
+# ✓ Success/failure tracking with detailed reporting
+# ✓ Query summary table with execution status
+# ✓ "See below" notation with result reprinting (NEW)
+# ✓ Long table outputs displayed after summary (NEW)
+# ✓ Error handling with timeout protection
+# ✓ Cross-platform compatibility (macOS/Linux)
+#
+# RECENT IMPORTANT ADDITIONS:
+# - Accepts --input-dir=PATH and forwards it into DML runs as a SystemDS named
+# argument: -nvargs input_dir=/path/to/data (DML can use sys.vinput_dir or
+# the named argument to locate data files instead of hardcoded `data/`).
+# - Fast-fail on missing input directory: the runner verifies the provided
+# input path exists and exits with a clear error message if not.
+# - Runtime SystemDS error detection: test-run output is scanned for runtime
+# error blocks (e.g., "An Error Occurred : ..."). Queries with runtime
+# failures are reported as `status: "error"` and include `error_message`
+# in generated JSON metadata for easier debugging and CI integration.
+#
+# MAJOR FEATURES IN v1.0 (First Public Release):
+# - Complete SSB query execution with SystemDS 3.4.0-SNAPSHOT
+# - Enhanced "see below" notation with result reprinting
+# - Long table outputs displayed after summary for better UX
+# - Eliminated need to scroll back through terminal output
+# - Maintained array alignment for consistent result tracking
+# - JSON metadata contains complete query results, not "see below"
+# - Added --out-dir option for custom output directory
+# - Multi-format output: TXT, CSV, JSON for each query result
+# - Structured output directory with comprehensive run.json metadata file
+#
+# DEPENDENCIES:
+# - SystemDS binary (3.4.0-SNAPSHOT or later)
+# - Single-threaded configuration file (auto-generated)
+# - SSB query files in scripts/ssb/queries/
+# - Bash 4.0+ with timeout support
+#
+# USAGE (from repo root):
+# scripts/ssb/shell/run_ssb.sh # run all SSB queries
+# scripts/ssb/shell/run_ssb.sh q1.1 q2.3 # run specific queries
+# scripts/ssb/shell/run_ssb.sh --stats # enable internal statistics
+# scripts/ssb/shell/run_ssb.sh q3.1 --stats # run specific query with stats
+# scripts/ssb/shell/run_ssb.sh --seed=12345 # run with specific seed for reproducibility
+# scripts/ssb/shell/run_ssb.sh --out-dir=/path # specify output directory for results
+#
+set -euo pipefail
+export LC_ALL=C
+
+# Determine script directory and project root (repo root)
+if command -v realpath >/dev/null 2>&1; then
+ SCRIPT_DIR="$(dirname "$(realpath "$0")")"
+else
+ SCRIPT_DIR="$(python - <<'PY'
+import os, sys
+print(os.path.dirname(os.path.abspath(sys.argv[1])))
+PY
+"$0")"
+fi
+if command -v git >/dev/null 2>&1 && git -C "$SCRIPT_DIR" rev-parse --show-toplevel >/dev/null 2>&1; then
+ PROJECT_ROOT="$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel)"
+else
+ __dir="$SCRIPT_DIR"
+ PROJECT_ROOT=""
+ while [[ "$__dir" != "/" ]]; do
+ if [[ -d "$__dir/.git" || -f "$__dir/pom.xml" ]]; then
+ PROJECT_ROOT="$__dir"; break
+ fi
+ __dir="$(dirname "$__dir")"
+ done
+ : "${PROJECT_ROOT:=$(cd "$SCRIPT_DIR/../../../" && pwd)}"
+fi
+
+# Locate SystemDS executable
+SYSTEMDS_CMD="$PROJECT_ROOT/bin/systemds"
+if [[ ! -x "$SYSTEMDS_CMD" ]]; then
+ SYSTEMDS_CMD="$(command -v systemds || true)"
+fi
+if [[ -z "$SYSTEMDS_CMD" || ! -x "$SYSTEMDS_CMD" ]]; then
+ echo "Error: could not find SystemDS executable." >&2
+ echo " Tried: $PROJECT_ROOT/bin/systemds and PATH" >&2
+ exit 1
+fi
+
+# Ensure single-threaded configuration file exists
+CONF_DIR="$PROJECT_ROOT/conf"
+SINGLE_THREAD_CONF="$CONF_DIR/single_thread.xml"
+mkdir -p "$CONF_DIR"
+if [[ ! -f "$SINGLE_THREAD_CONF" ]]; then
+cat > "$SINGLE_THREAD_CONF" <<'XML'
+
+
+ sysds.cp.parallel.opsfalse
+
+
+ sysds.num.threads1
+
+
+XML
+fi
+SYS_EXTRA_ARGS=( "-config" "$SINGLE_THREAD_CONF" )
+
+# Query directory
+QUERY_DIR="$PROJECT_ROOT/scripts/ssb/queries"
+
+# Verify query directory exists
+if [[ ! -d "$QUERY_DIR" ]]; then
+ echo "Error: Query directory not found: $QUERY_DIR" >&2
+ exit 1
+fi
+
+# Help function
+show_help() {
+ cat << 'EOF'
+SystemDS Star Schema Benchmark (SSB) Runner v1.0
+
+USAGE (from repo root):
+ scripts/ssb/shell/run_ssb.sh [OPTIONS] [QUERIES...]
+
+OPTIONS:
+ --stats, -stats Enable SystemDS internal statistics collection
+ --seed=N, -seed=N Set random seed for reproducible results (default: auto-generated)
+ --output-dir=PATH, -output-dir=PATH Specify custom output directory (default: $PROJECT_ROOT/scripts/ssb/shell/ssbOutputData/QueryData)
+ --input-dir=PATH, -input-dir=PATH Specify custom data directory (default: $PROJECT_ROOT/data)
+ --help, -help, -h, --h Show this help message
+ --version, -version, -v, --v Show version information
+
+QUERIES:
+ If no queries are specified, all available SSB queries (q*.dml) will be executed.
+ To run specific queries, provide their names (with or without .dml extension):
+ ./run_ssb.sh q1.1 q2.3 q4.1
+
+EXAMPLES (from repo root):
+ scripts/ssb/shell/run_ssb.sh # Run all SSB queries
+ scripts/ssb/shell/run_ssb.sh --stats # Run all queries with statistics
+ scripts/ssb/shell/run_ssb.sh -stats # Same as above (single dash)
+ scripts/ssb/shell/run_ssb.sh q1.1 q2.3 # Run specific queries only
+ scripts/ssb/shell/run_ssb.sh --seed=12345 --stats # Reproducible run with statistics
+ scripts/ssb/shell/run_ssb.sh -seed=12345 -stats # Same as above (single dash)
+ scripts/ssb/shell/run_ssb.sh --output-dir=/tmp/results # Custom output directory
+ scripts/ssb/shell/run_ssb.sh -output-dir=/tmp/results # Same as above (single dash)
+ scripts/ssb/shell/run_ssb.sh --input-dir=/path/to/data # Custom data directory
+ scripts/ssb/shell/run_ssb.sh -input-dir=/path/to/data # Same as above (single dash)
+
+OUTPUT:
+ Results are saved in multiple formats:
+ - TXT: Human-readable format
+ - CSV: Machine-readable data format
+ - JSON: Structured format with metadata
+ - run.json: Complete run metadata and results
+
+For more information, see the documentation in scripts/ssb/README.md
+EOF
+}
+
+# Parse arguments
+RUN_STATS=false
+QUERIES=()
+SEED=""
+OUT_DIR=""
+INPUT_DIR=""
+for arg in "$@"; do
+ if [[ "$arg" == "--help" || "$arg" == "-help" || "$arg" == "-h" || "$arg" == "--h" ]]; then
+ show_help
+ exit 0
+ elif [[ "$arg" == "--version" || "$arg" == "-version" || "$arg" == "-v" || "$arg" == "--v" ]]; then
+ echo "SystemDS Star Schema Benchmark (SSB) Runner v1.0"
+ echo "First Public Release: September 5, 2025"
+ exit 0
+ elif [[ "$arg" == "--stats" || "$arg" == "-stats" ]]; then
+ RUN_STATS=true
+ elif [[ "$arg" == --seed=* || "$arg" == -seed=* ]]; then
+ if [[ "$arg" == --seed=* ]]; then
+ SEED="${arg#--seed=}"
+ else
+ SEED="${arg#-seed=}"
+ fi
+ elif [[ "$arg" == "--seed" || "$arg" == "-seed" ]]; then
+ echo "Error: --seed/-seed requires a value (e.g., --seed=12345 or -seed=12345)" >&2
+ exit 1
+ elif [[ "$arg" == --output-dir=* || "$arg" == -output-dir=* ]]; then
+ if [[ "$arg" == --output-dir=* ]]; then
+ OUT_DIR="${arg#--output-dir=}"
+ else
+ OUT_DIR="${arg#-output-dir=}"
+ fi
+ elif [[ "$arg" == "--output-dir" || "$arg" == "-output-dir" ]]; then
+ echo "Error: --output-dir/-output-dir requires a value (e.g., --output-dir=/path/to/output or -output-dir=/path/to/output)" >&2
+ exit 1
+ elif [[ "$arg" == --input-dir=* || "$arg" == -input-dir=* ]]; then
+ if [[ "$arg" == --input-dir=* ]]; then
+ INPUT_DIR="${arg#--input-dir=}"
+ else
+ INPUT_DIR="${arg#-input-dir=}"
+ fi
+ elif [[ "$arg" == "--input-dir" || "$arg" == "-input-dir" ]]; then
+ echo "Error: --input-dir/-input-dir requires a value (e.g., --input-dir=/path/to/data or -input-dir=/path/to/data)" >&2
+ exit 1
+ else
+ # Check if argument looks like an unrecognized option (starts with dash)
+ if [[ "$arg" == -* ]]; then
+ echo "Error: Unrecognized option '$arg'" >&2
+ echo "Use --help or -h to see available options." >&2
+ exit 1
+ else
+ # Treat as query name
+ name="$(echo "$arg" | tr '.' '_')"
+ QUERIES+=( "$name.dml" )
+ fi
+ fi
+done
+
+# Set default output directory if not provided
+if [[ -z "$OUT_DIR" ]]; then
+ OUT_DIR="$PROJECT_ROOT/scripts/ssb/shell/ssbOutputData/QueryData"
+fi
+
+# Set default input data directory if not provided
+if [[ -z "$INPUT_DIR" ]]; then
+ INPUT_DIR="$PROJECT_ROOT/data"
+fi
+
+# Normalize paths by removing trailing slashes
+INPUT_DIR="${INPUT_DIR%/}"
+OUT_DIR="${OUT_DIR%/}"
+
+# Ensure output directory exists
+mkdir -p "$OUT_DIR"
+
+# Pass input directory to DML scripts via SystemDS named arguments
+NVARGS=( -nvargs "input_dir=${INPUT_DIR}" )
+
+# Validate input data directory exists
+if [[ ! -d "$INPUT_DIR" ]]; then
+ echo "Error: Input data directory '$INPUT_DIR' does not exist." >&2
+ echo "Please create the directory or specify a valid path with --input-dir=PATH" >&2
+ exit 1
+fi
+
+# Generate seed if not provided
+if [[ -z "$SEED" ]]; then
+ SEED=$((RANDOM * 32768 + RANDOM))
+fi
+
+# Discover queries if none provided
+shopt -s nullglob
+if [[ ${#QUERIES[@]} -eq 0 ]]; then
+ for f in "$QUERY_DIR"/q*.dml; do
+ if [[ -f "$f" ]]; then
+ QUERIES+=("$(basename "$f")")
+ fi
+ done
+ if [[ ${#QUERIES[@]} -eq 0 ]]; then
+ echo "Error: No query files found in $QUERY_DIR" >&2
+ exit 1
+ fi
+fi
+shopt -u nullglob
+
+# Metadata collection functions
+collect_system_metadata() {
+ local timestamp hostname systemds_version jdk_version cpu_info ram_info
+
+ # Basic system info
+ timestamp=$(date -u '+%Y-%m-%d %H:%M:%S UTC')
+ hostname=$(hostname 2>/dev/null || echo "unknown")
+
+ # SystemDS version
+ if [[ -x "$SYSTEMDS_CMD" ]]; then
+ # Try to get version from pom.xml first
+ if [[ -f "$PROJECT_ROOT/pom.xml" ]]; then
+ systemds_version=$(grep -A1 'org.apache.systemds' "$PROJECT_ROOT/pom.xml" | grep '' | sed 's/.*\(.*\)<\/version>.*/\1/' | head -1 2>/dev/null || echo "unknown")
+ else
+ systemds_version="unknown"
+ fi
+
+ # If pom.xml method failed, try alternative methods
+ if [[ "$systemds_version" == "unknown" ]]; then
+ # Try to extract from SystemDS JAR manifest
+ if [[ -f "$PROJECT_ROOT/target/systemds.jar" ]]; then
+ systemds_version=$(unzip -p "$PROJECT_ROOT/target/systemds.jar" META-INF/MANIFEST.MF 2>/dev/null | grep "Implementation-Version" | cut -d: -f2 | tr -d ' ' || echo "unknown")
+ else
+ # Try to find any SystemDS JAR and extract version
+ local jar_file=$(find "$PROJECT_ROOT" -name "systemds*.jar" | head -1 2>/dev/null)
+ if [[ -n "$jar_file" ]]; then
+ systemds_version=$(unzip -p "$jar_file" META-INF/MANIFEST.MF 2>/dev/null | grep "Implementation-Version" | cut -d: -f2 | tr -d ' ' || echo "unknown")
+ else
+ systemds_version="unknown"
+ fi
+ fi
+ fi
+ else
+ systemds_version="unknown"
+ fi
+
+ # JDK version
+ if command -v java >/dev/null 2>&1; then
+ jdk_version=$(java -version 2>&1 | head -1 | sed 's/.*"\(.*\)".*/\1/' || echo "unknown")
+ else
+ jdk_version="unknown"
+ fi
+
+ # System resources
+ if [[ "$(uname)" == "Darwin" ]]; then
+ # macOS
+ cpu_info=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "unknown")
+ ram_info=$(( $(sysctl -n hw.memsize 2>/dev/null || echo 0) / 1024 / 1024 / 1024 ))GB
+ else
+ # Linux
+ cpu_info=$(grep "model name" /proc/cpuinfo | head -1 | cut -d: -f2- | sed 's/^ *//' 2>/dev/null || echo "unknown")
+ ram_info=$(( $(grep MemTotal /proc/meminfo | awk '{print $2}' 2>/dev/null || echo 0) / 1024 / 1024 ))GB
+ fi
+
+ # Store metadata globally
+ RUN_TIMESTAMP="$timestamp"
+ RUN_HOSTNAME="$hostname"
+ RUN_SYSTEMDS_VERSION="$systemds_version"
+ RUN_JDK_VERSION="$jdk_version"
+ RUN_CPU_INFO="$cpu_info"
+ RUN_RAM_INFO="$ram_info"
+}
+
+collect_data_metadata() {
+ # Check for SSB data directory and get basic stats
+ local ssb_data_dir="$INPUT_DIR"
+ local json_parts=()
+ local display_parts=()
+
+ if [[ -d "$ssb_data_dir" ]]; then
+ # Try to get row counts from data files (if they exist)
+ for table in customer part supplier date; do
+ local file="$ssb_data_dir/${table}.tbl"
+ if [[ -f "$file" ]]; then
+ local count=$(wc -l < "$file" 2>/dev/null | tr -d ' ' || echo "0")
+ json_parts+=(" \"$table\": \"$count\"")
+ display_parts+=("$table:$count")
+ fi
+ done
+ # Check for any lineorder*.tbl file (SSB fact table)
+ local lineorder_file=$(find "$ssb_data_dir" -name "lineorder*.tbl" -type f | head -1)
+ if [[ -n "$lineorder_file" && -f "$lineorder_file" ]]; then
+ local count=$(wc -l < "$lineorder_file" 2>/dev/null | tr -d ' ' || echo "0")
+ json_parts+=(" \"lineorder\": \"$count\"")
+ display_parts+=("lineorder:$count")
+ fi
+ fi
+
+ if [[ ${#json_parts[@]} -eq 0 ]]; then
+ RUN_DATA_INFO='"No data files found"'
+ RUN_DATA_DISPLAY="No data files found"
+ else
+ # Join array elements with commas and newlines, wrap in braces for JSON
+ local formatted_json="{\n"
+ for i in "${!json_parts[@]}"; do
+ formatted_json+="${json_parts[$i]}"
+ if [[ $i -lt $((${#json_parts[@]} - 1)) ]]; then
+ formatted_json+=",\n"
+ else
+ formatted_json+="\n"
+ fi
+ done
+ formatted_json+=" }"
+ RUN_DATA_INFO="$formatted_json"
+
+ # Join with spaces for display
+ local IFS=" "
+ RUN_DATA_DISPLAY="${display_parts[*]}"
+ fi
+}
+
+# Output format functions
+create_output_structure() {
+ local run_id="$1"
+ local base_dir="$OUT_DIR/ssb_run_$run_id"
+
+ # Create output directory structure
+ mkdir -p "$base_dir"/{txt,csv,json}
+
+ # Set global variables for output paths
+ OUTPUT_BASE_DIR="$base_dir"
+ OUTPUT_TXT_DIR="$base_dir/txt"
+ OUTPUT_CSV_DIR="$base_dir/csv"
+ OUTPUT_JSON_DIR="$base_dir/json"
+ OUTPUT_METADATA_FILE="$base_dir/run.json"
+}
+
+save_query_result_txt() {
+ local query_name="$1"
+ local result_data="$2"
+ local output_file="$OUTPUT_TXT_DIR/${query_name}.txt"
+
+ {
+ echo "========================================="
+ echo "SSB Query: $query_name"
+ echo "========================================="
+ echo "Timestamp: $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
+ echo "Seed: $SEED"
+ echo ""
+ echo "Result:"
+ echo "---------"
+ echo "$result_data"
+ echo ""
+ echo "========================================="
+ } > "$output_file"
+}
+
+save_query_result_csv() {
+ local query_name="$1"
+ local result_data="$2"
+ local output_file="$OUTPUT_CSV_DIR/${query_name}.csv"
+
+ # Check if result is a single scalar value (including negative numbers and scientific notation)
+ if [[ "$result_data" =~ ^-?[0-9]+(\.[0-9]+)?([eE][+-]?[0-9]+)?$ ]]; then
+ # Scalar result
+ {
+ echo "query,result"
+ echo "$query_name,$result_data"
+ } > "$output_file"
+ else
+ # Table result - try to convert to CSV format
+ {
+ echo "# SSB Query: $query_name"
+ echo "# Timestamp: $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
+ echo "# Seed: $SEED"
+ # Convert space-separated table data to CSV
+ echo "$result_data" | sed 's/ */,/g' | sed 's/^,//g' | sed 's/,$//g'
+ } > "$output_file"
+ fi
+}
+
+save_query_result_json() {
+ local query_name="$1"
+ local result_data="$2"
+ local output_file="$OUTPUT_JSON_DIR/${query_name}.json"
+
+ # Escape quotes and special characters for JSON
+ local escaped_result=$(echo "$result_data" | sed 's/\\/\\\\/g' | sed 's/"/\\"/g' | tr '\n' ' ')
+
+ {
+ echo "{"
+ echo " \"query\": \"$query_name\","
+ echo " \"timestamp\": \"$(date -u '+%Y-%m-%d %H:%M:%S UTC')\","
+ echo " \"seed\": $SEED,"
+ echo " \"result\": \"$escaped_result\","
+ echo " \"metadata\": {"
+ echo " \"systemds_version\": \"$RUN_SYSTEMDS_VERSION\","
+ echo " \"hostname\": \"$RUN_HOSTNAME\""
+ echo " }"
+ echo "}"
+ } > "$output_file"
+}
+
+save_all_formats() {
+ local query_name="$1"
+ local result_data="$2"
+
+ save_query_result_txt "$query_name" "$result_data"
+ save_query_result_csv "$query_name" "$result_data"
+ save_query_result_json "$query_name" "$result_data"
+}
+
+# Collect metadata
+collect_system_metadata
+collect_data_metadata
+
+# Create output directory structure with timestamp-based run ID
+RUN_ID="$(date +%Y%m%d_%H%M%S)"
+create_output_structure "$RUN_ID"
+
+# Execute queries
+count=0
+failed=0
+SUCCESSFUL_QUERIES=() # Array to track successfully executed queries
+ALL_RUN_QUERIES=() # Array to track all queries that were attempted (in order)
+QUERY_STATUS=() # Array to track status: "success" or "error"
+QUERY_ERROR_MSG=() # Array to store error messages for failed queries
+QUERY_RESULTS=() # Array to track query results for display
+QUERY_FULL_RESULTS=() # Array to track complete query results for JSON
+QUERY_STATS=() # Array to track SystemDS statistics for JSON
+QUERY_TIMINGS=() # Array to track execution timing statistics
+LONG_OUTPUTS=() # Array to store long table outputs for display after summary
+
+# Progress indicator function
+progress_indicator() {
+ local query_name="$1"
+ local current="$2"
+ local total="$3"
+ echo -ne "\r[$current/$total] Running: $query_name "
+}
+
+for q in "${QUERIES[@]}"; do
+ dml="$QUERY_DIR/$q"
+ if [[ ! -f "$dml" ]]; then
+ echo "Warning: query file '$dml' not found; skipping." >&2
+ continue
+ fi
+
+ # Show progress
+ progress_indicator "$q" "$((count + failed + 1))" "${#QUERIES[@]}"
+
+ # Change to project root directory so relative paths in DML work correctly
+ cd "$PROJECT_ROOT"
+
+ # Clear progress line before showing output
+ echo -ne "\r \r"
+ echo "[$((count + failed + 1))/${#QUERIES[@]}] Running: $q"
+
+ # Record attempted query
+ ALL_RUN_QUERIES+=("$q")
+
+ if $RUN_STATS; then
+ # Capture output to extract result
+ temp_output=$(mktemp)
+ if "$SYSTEMDS_CMD" "$dml" -stats "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}" | tee "$temp_output"; then
+ # Even when SystemDS exits 0, the DML can emit runtime errors. Detect common error markers.
+ error_msg=$(sed -n '/An Error Occurred :/,$ p' "$temp_output" | sed -n '1,200p' | tr '\n' ' ' | sed 's/^ *//;s/ *$//')
+ if [[ -n "$error_msg" ]]; then
+ echo "Error: Query $q reported runtime error" >&2
+ echo "$error_msg" >&2
+ failed=$((failed+1))
+ QUERY_STATUS+=("error")
+ QUERY_ERROR_MSG+=("$error_msg")
+ # Maintain array alignment
+ QUERY_STATS+=("")
+ QUERY_RESULTS+=("N/A")
+ QUERY_FULL_RESULTS+=("N/A")
+ LONG_OUTPUTS+=("")
+ else
+ count=$((count+1))
+ SUCCESSFUL_QUERIES+=("$q") # Track successful query
+ QUERY_STATUS+=("success")
+ # Extract result - try multiple patterns with timeouts to prevent hanging:
+ # 1. Simple scalar pattern like "REVENUE: 687752409"
+ result=$(timeout 5s grep -E "^[A-Z_]+:\s*[0-9]+" "$temp_output" | tail -1 | awk '{print $2}' 2>/dev/null || true)
+ full_result="$result" # For scalar results, display and full results are the same
+
+ # 2. If no scalar pattern, check for table output and get row count
+ if [[ -z "$result" ]]; then
+ # Look for frame info like "# FRAME: nrow = 53, ncol = 3"
+ nrows=$(timeout 5s grep "# FRAME: nrow =" "$temp_output" | awk '{print $5}' | tr -d ',' 2>/dev/null || true)
+ if [[ -n "$nrows" ]]; then
+ result="${nrows} rows (see below)"
+ # Extract and store the long output for later display (excluding statistics)
+ long_output=$(grep -v "^#" "$temp_output" | grep -v "WARNING" | grep -v "WARN" | grep -v "^$" | sed '/^SystemDS Statistics:/,$ d')
+ LONG_OUTPUTS+=("$long_output")
+ # For JSON, store the actual table data
+ full_result="$long_output"
+ else
+ # Count actual data rows (lines with numbers, excluding headers and comments) - limit to prevent hanging
+ nrows=$(timeout 5s grep -E "^[0-9]" "$temp_output" | sed '/^SystemDS Statistics:/,$ d' | head -1000 | wc -l | tr -d ' ' 2>/dev/null || echo "0")
+ if [[ "$nrows" -gt 0 ]]; then
+ result="${nrows} rows (see below)"
+ # Extract and store the long output for later display (excluding statistics)
+ long_output=$(grep -E "^[0-9]" "$temp_output" | sed '/^SystemDS Statistics:/,$ d' | head -1000)
+ LONG_OUTPUTS+=("$long_output")
+ # For JSON, store the actual table data
+ full_result="$long_output"
+ else
+ result="N/A"
+ full_result="N/A"
+ LONG_OUTPUTS+=("") # Empty placeholder to maintain array alignment
+ fi
+ fi
+ else
+ LONG_OUTPUTS+=("") # Empty placeholder for scalar results to maintain array alignment
+ fi
+ QUERY_RESULTS+=("$result") # Track query result for display
+ QUERY_FULL_RESULTS+=("$full_result") # Track complete query result for JSON
+
+ # Save result in all formats
+ query_name_clean="${q%.dml}"
+
+ # Extract and store statistics for JSON (preserving newlines)
+ stats_output=$(sed -n '/^SystemDS Statistics:/,$ p' "$temp_output")
+ QUERY_STATS+=("$stats_output") # Track statistics for JSON
+
+ save_all_formats "$query_name_clean" "$full_result"
+ fi
+ else
+ echo "Error: Query $q failed" >&2
+ failed=$((failed+1))
+ QUERY_STATUS+=("error")
+ QUERY_ERROR_MSG+=("Query execution failed (non-zero exit)")
+ # Add empty stats entry for failed queries to maintain array alignment
+ QUERY_STATS+=("")
+ fi
+ rm -f "$temp_output"
+ else
+ # Capture output to extract result
+ temp_output=$(mktemp)
+ if "$SYSTEMDS_CMD" "$dml" "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}" | tee "$temp_output"; then
+ # Detect runtime errors in output even if command returned 0
+ error_msg=$(sed -n '/An Error Occurred :/,$ p' "$temp_output" | sed -n '1,200p' | tr '\n' ' ' | sed 's/^ *//;s/ *$//')
+ if [[ -n "$error_msg" ]]; then
+ echo "Error: Query $q reported runtime error" >&2
+ echo "$error_msg" >&2
+ failed=$((failed+1))
+ QUERY_STATUS+=("error")
+ QUERY_ERROR_MSG+=("$error_msg")
+ QUERY_STATS+=("")
+ QUERY_RESULTS+=("N/A")
+ QUERY_FULL_RESULTS+=("N/A")
+ LONG_OUTPUTS+=("")
+ else
+ count=$((count+1))
+ SUCCESSFUL_QUERIES+=("$q") # Track successful query
+ QUERY_STATUS+=("success")
+ # Extract result - try multiple patterns with timeouts to prevent hanging:
+ # 1. Simple scalar pattern like "REVENUE: 687752409"
+ result=$(timeout 5s grep -E "^[A-Z_]+:\s*[0-9]+" "$temp_output" | tail -1 | awk '{print $2}' 2>/dev/null || true)
+ full_result="$result" # For scalar results, display and full results are the same
+
+ # 2. If no scalar pattern, check for table output and get row count
+ if [[ -z "$result" ]]; then
+ # Look for frame info like "# FRAME: nrow = 53, ncol = 3"
+ nrows=$(timeout 5s grep "# FRAME: nrow =" "$temp_output" | awk '{print $5}' | tr -d ',' 2>/dev/null || true)
+ if [[ -n "$nrows" ]]; then
+ result="${nrows} rows (see below)"
+ # Extract and store the long output for later display
+ long_output=$(grep -v "^#" "$temp_output" | grep -v "WARNING" | grep -v "WARN" | grep -v "^$" | tail -n +1)
+ LONG_OUTPUTS+=("$long_output")
+ # For JSON, store the actual table data
+ full_result="$long_output"
+ else
+ # Count actual data rows (lines with numbers, excluding headers and comments) - limit to prevent hanging
+ nrows=$(timeout 5s grep -E "^[0-9]" "$temp_output" | head -1000 | wc -l | tr -d ' ' 2>/dev/null || echo "0")
+ if [[ "$nrows" -gt 0 ]]; then
+ result="${nrows} rows (see below)"
+ # Extract and store the long output for later display
+ long_output=$(grep -E "^[0-9]" "$temp_output" | head -1000)
+ LONG_OUTPUTS+=("$long_output")
+ # For JSON, store the actual table data
+ full_result="$long_output"
+ else
+ result="N/A"
+ full_result="N/A"
+ LONG_OUTPUTS+=("") # Empty placeholder to maintain array alignment
+ fi
+ fi
+ else
+ LONG_OUTPUTS+=("") # Empty placeholder for scalar results to maintain array alignment
+ fi
+ QUERY_RESULTS+=("$result") # Track query result for display
+ QUERY_FULL_RESULTS+=("$full_result") # Track complete query result for JSON
+
+ # Add empty stats entry for non-stats runs to maintain array alignment
+ QUERY_STATS+=("")
+
+ # Save result in all formats
+ query_name_clean="${q%.dml}"
+ save_all_formats "$query_name_clean" "$full_result"
+ fi
+ else
+ echo "Error: Query $q failed" >&2
+ failed=$((failed+1))
+ QUERY_STATUS+=("error")
+ QUERY_ERROR_MSG+=("Query execution failed (non-zero exit)")
+ # Add empty stats entry for failed queries to maintain array alignment
+ QUERY_STATS+=("")
+ fi
+ rm -f "$temp_output"
+ fi
+done
+
+# Summary
+echo ""
+echo "========================================="
+echo "SSB benchmark completed!"
+echo "Total queries executed: $count"
+if [[ $failed -gt 0 ]]; then
+ echo "Failed queries: $failed"
+fi
+if $RUN_STATS; then
+ echo "Statistics: enabled"
+else
+ echo "Statistics: disabled"
+fi
+
+# Display run metadata summary
+echo ""
+echo "========================================="
+echo "RUN METADATA SUMMARY"
+echo "========================================="
+echo "Timestamp: $RUN_TIMESTAMP"
+echo "Hostname: $RUN_HOSTNAME"
+echo "Seed: $SEED"
+echo ""
+echo "Software Versions:"
+echo " SystemDS: $RUN_SYSTEMDS_VERSION"
+echo " JDK: $RUN_JDK_VERSION"
+echo ""
+echo "System Resources:"
+echo " CPU: $RUN_CPU_INFO"
+echo " RAM: $RUN_RAM_INFO"
+echo ""
+echo "Data Build Info:"
+echo " SSB Data: $RUN_DATA_DISPLAY"
+echo "========================================="
+
+# Generate metadata JSON file (include all attempted queries with status and error messages)
+{
+ echo "{"
+ echo " \"benchmark_type\": \"ssb_systemds\","
+ echo " \"timestamp\": \"$RUN_TIMESTAMP\","
+ echo " \"hostname\": \"$RUN_HOSTNAME\","
+ echo " \"seed\": $SEED,"
+ echo " \"software_versions\": {"
+ echo " \"systemds\": \"$RUN_SYSTEMDS_VERSION\","
+ echo " \"jdk\": \"$RUN_JDK_VERSION\""
+ echo " },"
+ echo " \"system_resources\": {"
+ echo " \"cpu\": \"$RUN_CPU_INFO\","
+ echo " \"ram\": \"$RUN_RAM_INFO\""
+ echo " },"
+ echo -e " \"data_build_info\": $RUN_DATA_INFO,"
+ echo " \"run_configuration\": {"
+ echo " \"statistics_enabled\": $(if $RUN_STATS; then echo "true"; else echo "false"; fi),"
+ echo " \"queries_selected\": ${#QUERIES[@]},"
+ echo " \"queries_executed\": $count,"
+ echo " \"queries_failed\": $failed"
+ echo " },"
+ echo " \"results\": ["
+ for i in "${!ALL_RUN_QUERIES[@]}"; do
+ query="${ALL_RUN_QUERIES[$i]}"
+ status="${QUERY_STATUS[$i]:-error}"
+ error_msg="${QUERY_ERROR_MSG[$i]:-}"
+ # Find matching full_result and stats by searching SUCCESSFUL_QUERIES index
+ full_result=""
+ stats_result=""
+ if [[ "$status" == "success" ]]; then
+ # Find index in SUCCESSFUL_QUERIES
+ for j in "${!SUCCESSFUL_QUERIES[@]}"; do
+ if [[ "${SUCCESSFUL_QUERIES[$j]}" == "$query" ]]; then
+ full_result="${QUERY_FULL_RESULTS[$j]}"
+ stats_result="${QUERY_STATS[$j]}"
+ break
+ fi
+ done
+ fi
+ # Escape quotes and newlines for JSON
+ escaped_result=$(echo "$full_result" | sed 's/\\/\\\\/g' | sed 's/"/\\"/g' | tr '\n' ' ')
+ escaped_error=$(echo "$error_msg" | sed 's/\\/\\\\/g' | sed 's/"/\\"/g' | tr '\n' ' ')
+
+ echo " {"
+ echo " \"query\": \"${query%.dml}\","
+ echo " \"status\": \"$status\","
+ echo " \"error_message\": \"$escaped_error\","
+ echo " \"result\": \"$escaped_result\""
+ if [[ -n "$stats_result" ]]; then
+ echo " ,\"stats\": ["
+ echo "$stats_result" | sed 's/\\/\\\\/g' | sed 's/"/\\"/g' | sed 's/\t/ /g' | awk '
+ BEGIN { first = 1 }
+ {
+ if (!first) printf ",\n"
+ printf " \"%s\"", $0
+ first = 0
+ }
+ END { if (!first) printf "\n" }
+ '
+ echo " ]"
+ fi
+ if [[ $i -lt $((${#ALL_RUN_QUERIES[@]} - 1)) ]]; then
+ echo " },"
+ else
+ echo " }"
+ fi
+ done
+ echo " ]"
+ echo "}"
+} > "$OUTPUT_METADATA_FILE"
+
+echo ""
+echo "Metadata saved to $OUTPUT_METADATA_FILE"
+echo "Output directory: $OUTPUT_BASE_DIR"
+echo " - TXT files: $OUTPUT_TXT_DIR"
+echo " - CSV files: $OUTPUT_CSV_DIR"
+echo " - JSON files: $OUTPUT_JSON_DIR"
+
+# Detailed per-query summary (show status and error messages if any)
+if [[ ${#ALL_RUN_QUERIES[@]} -gt 0 ]]; then
+ echo ""
+ echo "==================================================="
+ echo "QUERIES SUMMARY"
+ echo "==================================================="
+ printf "%-4s %-15s %-30s %s\n" "No." "Query" "Result" "Status"
+ echo "---------------------------------------------------"
+ for i in "${!ALL_RUN_QUERIES[@]}"; do
+ query="${ALL_RUN_QUERIES[$i]}"
+ query_display="${query%.dml}" # Remove .dml extension for display
+ status="${QUERY_STATUS[$i]:-error}"
+ if [[ "$status" == "success" ]]; then
+ # Find index in SUCCESSFUL_QUERIES to fetch result
+ result=""
+ for j in "${!SUCCESSFUL_QUERIES[@]}"; do
+ if [[ "${SUCCESSFUL_QUERIES[$j]}" == "$query" ]]; then
+ result="${QUERY_RESULTS[$j]}"
+ break
+ fi
+ done
+ printf "%-4d %-15s %-30s %s\n" "$((i+1))" "$query_display" "$result" "✓ Success"
+ else
+ err="${QUERY_ERROR_MSG[$i]:-Unknown error}"
+ printf "%-4d %-15s %-30s %s\n" "$((i+1))" "$query_display" "N/A" "ERROR: ${err}"
+ fi
+ done
+echo "==================================================="
+fi
+
+# Display long outputs for queries that had table results
+if [[ ${#SUCCESSFUL_QUERIES[@]} -gt 0 ]]; then
+ # Check if we have any long outputs to display
+ has_long_outputs=false
+ for i in "${!LONG_OUTPUTS[@]}"; do
+ if [[ -n "${LONG_OUTPUTS[$i]}" ]]; then
+ has_long_outputs=true
+ break
+ fi
+ done
+
+ if $has_long_outputs; then
+ echo ""
+ echo "========================================="
+ echo "DETAILED QUERY RESULTS"
+ echo "========================================="
+ for i in "${!SUCCESSFUL_QUERIES[@]}"; do
+ if [[ -n "${LONG_OUTPUTS[$i]}" ]]; then
+ query="${SUCCESSFUL_QUERIES[$i]}"
+ query_display="${query%.dml}" # Remove .dml extension for display
+ echo ""
+ echo "[$((i+1))] Results for $query_display:"
+ echo "----------------------------------------"
+ echo "${LONG_OUTPUTS[$i]}"
+ echo "----------------------------------------"
+ fi
+ done
+ echo "========================================="
+ fi
+fi
+
+# Exit with appropriate code
+if [[ $failed -gt 0 ]]; then
+ exit 1
+fi
diff --git a/scripts/ssb/sql/q1.1.sql b/scripts/ssb/sql/q1.1.sql
new file mode 100644
index 00000000000..02e3844d12c
--- /dev/null
+++ b/scripts/ssb/sql/q1.1.sql
@@ -0,0 +1,7 @@
+SELECT SUM(lo_extendedprice * lo_discount) AS REVENUE
+FROM lineorder, dates
+WHERE
+ lo_orderdate = d_datekey
+ AND d_year = 1993
+ AND lo_discount BETWEEN 1 AND 3
+ AND lo_quantity < 25;
\ No newline at end of file
diff --git a/scripts/ssb/sql/q1.2.sql b/scripts/ssb/sql/q1.2.sql
new file mode 100644
index 00000000000..834d73f623f
--- /dev/null
+++ b/scripts/ssb/sql/q1.2.sql
@@ -0,0 +1,7 @@
+SELECT SUM(lo_extendedprice * lo_discount) AS REVENUE
+FROM lineorder, dates
+WHERE
+ lo_orderdate = d_datekey
+ AND d_yearmonth = 'Jan1994'
+ AND lo_discount BETWEEN 4 AND 6
+ AND lo_quantity BETWEEN 26 AND 35;
\ No newline at end of file
diff --git a/scripts/ssb/sql/q1.3.sql b/scripts/ssb/sql/q1.3.sql
new file mode 100644
index 00000000000..7a09490b840
--- /dev/null
+++ b/scripts/ssb/sql/q1.3.sql
@@ -0,0 +1,9 @@
+SELECT
+ SUM(lo_extendedprice * lo_discount) AS REVENUE
+FROM lineorder, dates
+WHERE
+ lo_orderdate = d_datekey
+ AND d_weeknuminyear = 6
+ AND d_year = 1994
+ AND lo_discount BETWEEN 5 AND 7
+ AND lo_quantity BETWEEN 26 AND 35;
\ No newline at end of file
diff --git a/scripts/ssb/sql/q2.1.sql b/scripts/ssb/sql/q2.1.sql
new file mode 100644
index 00000000000..f455ff9e935
--- /dev/null
+++ b/scripts/ssb/sql/q2.1.sql
@@ -0,0 +1,10 @@
+SELECT SUM(lo_revenue), d_year, p_brand
+FROM lineorder, dates, part, supplier
+WHERE
+ lo_orderdate = d_datekey
+ AND lo_partkey = p_partkey
+ AND lo_suppkey = s_suppkey
+ AND p_category = 'MFGR#12'
+ AND s_region = 'AMERICA'
+GROUP BY d_year, p_brand
+ORDER BY p_brand;
\ No newline at end of file
diff --git a/scripts/ssb/sql/q2.2.sql b/scripts/ssb/sql/q2.2.sql
new file mode 100644
index 00000000000..e28d55153c2
--- /dev/null
+++ b/scripts/ssb/sql/q2.2.sql
@@ -0,0 +1,10 @@
+SELECT SUM(lo_revenue), d_year, p_brand
+FROM lineorder, dates, part, supplier
+WHERE
+ lo_orderdate = d_datekey
+ AND lo_partkey = p_partkey
+ AND lo_suppkey = s_suppkey
+ AND p_brand BETWEEN 'MFGR#2221' AND 'MFGR#2228'
+ AND s_region = 'ASIA'
+GROUP BY d_year, p_brand
+ORDER BY d_year, p_brand;
\ No newline at end of file
diff --git a/scripts/ssb/sql/q2.3.sql b/scripts/ssb/sql/q2.3.sql
new file mode 100644
index 00000000000..8ec135cef0a
--- /dev/null
+++ b/scripts/ssb/sql/q2.3.sql
@@ -0,0 +1,10 @@
+SELECT SUM(lo_revenue), d_year, p_brand
+FROM lineorder, dates, part, supplier
+WHERE
+ lo_orderdate = d_datekey
+ AND lo_partkey = p_partkey
+ AND lo_suppkey = s_suppkey
+ AND p_brand = 'MFGR#2239'
+ AND s_region = 'EUROPE'
+GROUP BY d_year, p_brand
+ORDER BY d_year, p_brand;
\ No newline at end of file
diff --git a/scripts/ssb/sql/q3.1.sql b/scripts/ssb/sql/q3.1.sql
new file mode 100644
index 00000000000..badd93f973a
--- /dev/null
+++ b/scripts/ssb/sql/q3.1.sql
@@ -0,0 +1,16 @@
+SELECT
+ c_nation,
+ s_nation,
+ d_year,
+ SUM(lo_revenue) AS REVENUE
+FROM customer, lineorder, supplier, dates
+WHERE
+ lo_custkey = c_custkey
+ AND lo_suppkey = s_suppkey
+ AND lo_orderdate = d_datekey
+ AND c_region = 'ASIA'
+ AND s_region = 'ASIA'
+ AND d_year >= 1992
+ AND d_year <= 1997
+GROUP BY c_nation, s_nation, d_year
+ORDER BY d_year ASC, REVENUE DESC;
\ No newline at end of file
diff --git a/scripts/ssb/sql/q3.2.sql b/scripts/ssb/sql/q3.2.sql
new file mode 100644
index 00000000000..fc5564d3b6e
--- /dev/null
+++ b/scripts/ssb/sql/q3.2.sql
@@ -0,0 +1,16 @@
+SELECT
+ c_city,
+ s_city,
+ d_year,
+ SUM(lo_revenue) AS REVENUE
+FROM customer, lineorder, supplier, dates
+WHERE
+ lo_custkey = c_custkey
+ AND lo_suppkey = s_suppkey
+ AND lo_orderdate = d_datekey
+ AND c_nation = 'UNITED STATES'
+ AND s_nation = 'UNITED STATES'
+ AND d_year >= 1992
+ AND d_year <= 1997
+GROUP BY c_city, s_city, d_year
+ORDER BY d_year ASC, REVENUE DESC;
\ No newline at end of file
diff --git a/scripts/ssb/sql/q3.3.sql b/scripts/ssb/sql/q3.3.sql
new file mode 100644
index 00000000000..5fdfdf39eae
--- /dev/null
+++ b/scripts/ssb/sql/q3.3.sql
@@ -0,0 +1,22 @@
+SELECT
+ c_city,
+ s_city,
+ d_year,
+ SUM(lo_revenue) AS REVENUE
+FROM customer, lineorder, supplier, dates
+WHERE
+ lo_custkey = c_custkey
+ AND lo_suppkey = s_suppkey
+ AND lo_orderdate = d_datekey
+ AND (
+ c_city = 'UNITED KI1'
+ OR c_city = 'UNITED KI5'
+ )
+ AND (
+ s_city = 'UNITED KI1'
+ OR s_city = 'UNITED KI5'
+ )
+ AND d_year >= 1992
+ AND d_year <= 1997
+GROUP BY c_city, s_city, d_year
+ORDER BY d_year ASC, REVENUE DESC;
\ No newline at end of file
diff --git a/scripts/ssb/sql/q3.4.sql b/scripts/ssb/sql/q3.4.sql
new file mode 100644
index 00000000000..a94a81795f5
--- /dev/null
+++ b/scripts/ssb/sql/q3.4.sql
@@ -0,0 +1,21 @@
+SELECT
+ c_city,
+ s_city,
+ d_year,
+ SUM(lo_revenue) AS REVENUE
+FROM customer, lineorder, supplier, dates
+WHERE
+ lo_custkey = c_custkey
+ AND lo_suppkey = s_suppkey
+ AND lo_orderdate = d_datekey
+ AND (
+ c_city = 'UNITED KI1'
+ OR c_city = 'UNITED KI5'
+ )
+ AND (
+ s_city = 'UNITED KI1'
+ OR s_city = 'UNITED KI5'
+ )
+ AND d_yearmonth = 'Dec1997'
+GROUP BY c_city, s_city, d_year
+ORDER BY d_year ASC, REVENUE DESC;
\ No newline at end of file
diff --git a/scripts/ssb/sql/q4.1.sql b/scripts/ssb/sql/q4.1.sql
new file mode 100644
index 00000000000..a7d48bfe436
--- /dev/null
+++ b/scripts/ssb/sql/q4.1.sql
@@ -0,0 +1,18 @@
+SELECT
+ d_year,
+ c_nation,
+ SUM(lo_revenue - lo_supplycost) AS PROFIT
+FROM dates, customer, supplier, part, lineorder
+WHERE
+ lo_custkey = c_custkey
+ AND lo_suppkey = s_suppkey
+ AND lo_partkey = p_partkey
+ AND lo_orderdate = d_datekey
+ AND c_region = 'AMERICA'
+ AND s_region = 'AMERICA'
+ AND (
+ p_mfgr = 'MFGR#1'
+ OR p_mfgr = 'MFGR#2'
+ )
+GROUP BY d_year, c_nation
+ORDER BY d_year, c_nation;
diff --git a/scripts/ssb/sql/q4.2.sql b/scripts/ssb/sql/q4.2.sql
new file mode 100644
index 00000000000..1c68951d58d
--- /dev/null
+++ b/scripts/ssb/sql/q4.2.sql
@@ -0,0 +1,23 @@
+SELECT
+ d_year,
+ s_nation,
+ p_category,
+ SUM(lo_revenue - lo_supplycost) AS PROFIT
+FROM dates, customer, supplier, part, lineorder
+WHERE
+ lo_custkey = c_custkey
+ AND lo_suppkey = s_suppkey
+ AND lo_partkey = p_partkey
+ AND lo_orderdate = d_datekey
+ AND c_region = 'AMERICA'
+ AND s_region = 'AMERICA'
+ AND (
+ d_year = 1997
+ OR d_year = 1998
+ )
+ AND (
+ p_mfgr = 'MFGR#1'
+ OR p_mfgr = 'MFGR#2'
+ )
+GROUP BY d_year, s_nation, p_category
+ORDER BY d_year, s_nation, p_category;
\ No newline at end of file
diff --git a/scripts/ssb/sql/q4.3.sql b/scripts/ssb/sql/q4.3.sql
new file mode 100644
index 00000000000..815ab2d8a56
--- /dev/null
+++ b/scripts/ssb/sql/q4.3.sql
@@ -0,0 +1,19 @@
+SELECT
+ d_year,
+ s_city,
+ p_brand,
+ SUM(lo_revenue - lo_supplycost) AS PROFIT
+FROM dates, customer, supplier, part, lineorder
+WHERE
+ lo_custkey = c_custkey
+ AND lo_suppkey = s_suppkey
+ AND lo_partkey = p_partkey
+ AND lo_orderdate = d_datekey
+ AND s_nation = 'UNITED STATES'
+ AND (
+ d_year = 1997
+ OR d_year = 1998
+ )
+ AND p_category = 'MFGR#14'
+GROUP BY d_year, s_city, p_brand
+ORDER BY d_year, s_city, p_brand;
\ No newline at end of file
diff --git a/spark_config.xml b/spark_config.xml
new file mode 100644
index 00000000000..8db991ba42d
--- /dev/null
+++ b/spark_config.xml
@@ -0,0 +1,12 @@
+
+
+
+
+ SSB_Q1_1_Test
+ local[*]
+ 4g
+ 4g
+ 2g
+
+
+
\ No newline at end of file
diff --git a/src/main/java/org/apache/sysds/hops/BinaryOp.java b/src/main/java/org/apache/sysds/hops/BinaryOp.java
index 2b803a053c1..4dd5e1f243d 100644
--- a/src/main/java/org/apache/sysds/hops/BinaryOp.java
+++ b/src/main/java/org/apache/sysds/hops/BinaryOp.java
@@ -854,6 +854,9 @@ else if( (op == OpOp2.CBIND && getDataType().isList())
_etype = ExecType.CP;
}
+ if( _etype == ExecType.OOC ) //TODO
+ setExecType(ExecType.CP);
+
//mark for recompile (forever)
setRequiresRecompileIfNecessary();
diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/CacheableData.java b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/CacheableData.java
index 34a8aa18631..67f9f698a97 100644
--- a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/CacheableData.java
+++ b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/CacheableData.java
@@ -73,8 +73,8 @@
/**
* Each object of this class is a cache envelope for some large piece of data
- * called "cache block". For example, the body of a matrix can be the cache block.
- * The term cache block refers strictly to the cacheable portion of the data object,
+ * called "cache block". For example, the body of a matrix can be the cache block.
+ * The term cache block refers strictly to the cacheable portion of the data object,
* often excluding metadata and auxiliary parameters, as defined in the subclasses.
* Under the protection of the envelope, the data blob may be evicted to
* the file system; then the subclass must set its reference to null
@@ -96,43 +96,43 @@ public abstract class CacheableData> extends Data
public static final String CACHING_EVICTION_FILEEXTENSION = ".dat";
public static final boolean CACHING_ASYNC_FILECLEANUP = true;
public static boolean CACHING_ASYNC_SERIALIZE = false;
-
+
//NOTE CACHING_ASYNC_SERIALIZE:
- // The serialization of matrices and frames (ultra-sparse matrices or
- // frames with strings) into buffer pool byte arrays happens outside the
+ // The serialization of matrices and frames (ultra-sparse matrices or
+ // frames with strings) into buffer pool byte arrays happens outside the
// critical region of the global lock in LazyWriteBuffer. However, it still
- // requires thread-local serialization (before returning from release) in
- // order to guarantee that not too many objects are pinned at the same time
- // which would violate the memory budget. Therefore, the new asynchronous
+ // requires thread-local serialization (before returning from release) in
+ // order to guarantee that not too many objects are pinned at the same time
+ // which would violate the memory budget. Therefore, the new asynchronous
// serialization (see CACHING_ASYNC_SERIALIZE) should be understood as
// optimistic with weaker guarantees.
-
+
/**
* Defines all possible cache status types for a data blob.
* An object of class {@link CacheableData} can be in one of the following
* five status types:
*
- * EMPTY: Either there is no data blob at all, or the data blob
+ * EMPTY: Either there is no data blob at all, or the data blob
* resides in a specified import file and has never been downloaded yet.
* READ: The data blob is in main memory; one or more threads are
* referencing and reading it (shared "read-only" lock). This status uses a
* counter. Eviction is NOT allowed.
* MODIFY: The data blob is in main memory; exactly one thread is
* referencing and modifying it (exclusive "write" lock). Eviction is NOT allowed.
- * CACHED: The data blob is in main memory, and nobody is using nor referencing it.
+ * CACHED: The data blob is in main memory, and nobody is using nor referencing it.
* There is always an persistent recovery object for it
**/
public enum CacheStatus {
- EMPTY,
- READ,
- MODIFY,
+ EMPTY,
+ READ,
+ MODIFY,
CACHED,
CACHED_NOWRITE,
}
-
+
/** Global flag indicating if caching is enabled (controls eviction) */
private static volatile boolean _activeFlag = false;
-
+
/** Global sequence for generating unique ids. */
private static IDSequence _seq = null;
@@ -147,9 +147,9 @@ public enum CacheStatus {
@Override protected Long initialValue() { return 0L; }
};
- //current size of live broadcast objects (because Spark's ContextCleaner maintains
- //a buffer with references to prevent eager cleanup by GC); note that this is an
- //overestimate, because we maintain partitioned broadcasts as soft references, which
+ //current size of live broadcast objects (because Spark's ContextCleaner maintains
+ //a buffer with references to prevent eager cleanup by GC); note that this is an
+ //overestimate, because we maintain partitioned broadcasts as soft references, which
//might be collected by the GC and subsequently cleaned up by Spark's ContextCleaner.
private static final AtomicLong _refBCs = new AtomicLong(0);
@@ -159,16 +159,16 @@ public enum CacheStatus {
/**
* The unique (JVM-wide) ID of a cacheable data object; to ensure unique IDs across JVMs, we
- * concatenate filenames with a unique prefix (map task ID).
+ * concatenate filenames with a unique prefix (map task ID).
*/
private final long _uniqueID;
-
+
/** The cache status of the data blob (whether it can be or is evicted, etc. */
private CacheStatus _cacheStatus = null;
-
+
/** Cache for actual data, evicted by garbage collector. */
protected SoftReference _cache = null;
-
+
/** Container object that holds the actual data. */
protected T _data = null;
@@ -177,47 +177,47 @@ public enum CacheStatus {
* includes: 1) Matrix dimensions, if available 2) Number of non-zeros, if
* available 3) Block dimensions, if applicable 4) InputInfo -- subsequent
* operations that use this Matrix expect it to be in this format.
- *
+ *
* When the matrix is written to HDFS (local file system, as well?), one
* must get the OutputInfo that matches with InputInfo stored inside _mtd.
*/
protected MetaData _metaData = null;
-
+
protected FederationMap _fedMapping = null;
protected boolean _compressed = false;
protected long _compressedSize = -1;
-
+
/** The name of HDFS file in which the data is backed up. */
protected String _hdfsFileName = null; // file name and path
protected boolean _isPRead = false; //persistent read, must not be deleted
-
- /**
- * Flag that indicates whether or not hdfs file exists.It is used
- * for improving the performance of "rmvar" instruction. When it has
- * value false, one can skip file system existence
+
+ /**
+ * Flag that indicates whether or not hdfs file exists.It is used
+ * for improving the performance of "rmvar" instruction. When it has
+ * value false, one can skip file system existence
* checks which can be expensive.
*/
- private boolean _hdfsFileExists = false;
+ private boolean _hdfsFileExists = false;
/** Information relevant to specific external file formats. */
private FileFormatProperties _formatProps = null;
-
+
/**
* true if the in-memory or evicted matrix may be different from
* the matrix located at {@link #_hdfsFileName}; false if the two
* matrices should be the same.
*/
private boolean _dirtyFlag = false;
-
+
// additional private flags and meta data
private int _numReadThreads = 0; //number of threads for read from HDFS
- private boolean _cleanupFlag = true; //flag if obj unpinned (cleanup enabled)
+ private boolean _cleanupFlag = true; //flag if obj unpinned (cleanup enabled)
private String _cacheFileName = null; //local eviction file name
private boolean _requiresLocalWrite = false; //flag if local write for read obj
- private boolean _isAcquireFromEmpty = false; //flag if read from status empty
-
+ private boolean _isAcquireFromEmpty = false; //flag if read from status empty
+
//backend-specific handles
//note: we use the abstraction of LineageObjects for two reasons: (1) to keep track of cleanup
//for lazily evaluated RDDs, and (2) as abstraction for environments that do not necessarily have spark libraries available
@@ -225,13 +225,13 @@ public enum CacheStatus {
private BroadcastObject _bcHandle = null; //Broadcast handle
protected HashMap _gpuObjects = null; //Per GPUContext object allocated on GPU
//TODO generalize for frames
- private OOCStreamable _streamHandle = null;
-
+ private LocalTaskQueue _streamHandle = null;
+
private LineageItem _lineage = null;
-
+
/**
* Basic constructor for any cacheable data.
- *
+ *
* @param dt data type
* @param vt value type
*/
@@ -242,28 +242,28 @@ protected CacheableData(DataType dt, ValueType vt) {
_numReadThreads = 0;
_gpuObjects = DMLScript.USE_ACCELERATOR ? new HashMap<>() : null;
}
-
+
/**
* Copy constructor for cacheable data (of same type).
- *
+ *
* @param that cacheable data object
*/
protected CacheableData(CacheableData that) {
this( that.getDataType(), that.getValueType() );
_cleanupFlag = that._cleanupFlag;
_hdfsFileName = that._hdfsFileName;
- _hdfsFileExists = that._hdfsFileExists;
+ _hdfsFileExists = that._hdfsFileExists;
_gpuObjects = that._gpuObjects;
_dirtyFlag = that._dirtyFlag;
_compressed = that._compressed;
_compressedSize = that._compressedSize;
_fedMapping = that._fedMapping;
}
-
+
/**
- * Enables or disables the cleanup of the associated
+ * Enables or disables the cleanup of the associated
* data object on clearData().
- *
+ *
* @param flag true if cleanup
*/
public void enableCleanup(boolean flag) {
@@ -271,15 +271,15 @@ public void enableCleanup(boolean flag) {
}
/**
- * Indicates if cleanup of the associated data object
+ * Indicates if cleanup of the associated data object
* is enabled on clearData().
- *
+ *
* @return true if cleanup enabled
*/
public boolean isCleanupEnabled() {
return _cleanupFlag;
}
-
+
public CacheStatus getStatus() {
return _cacheStatus;
}
@@ -295,15 +295,15 @@ public void setHDFSFileExists( boolean flag ) {
public String getFileName() {
return _hdfsFileName;
}
-
+
public boolean isPersistentRead() {
return _isPRead;
}
-
+
public void setPersistentRead(boolean pread) {
_isPRead = pread;
}
-
+
public long getUniqueID() {
return _uniqueID;
}
@@ -314,12 +314,12 @@ public synchronized void setFileName( String file ) {
_dirtyFlag = true;
_hdfsFileName = file;
}
-
+
/**
* true if the in-memory or evicted matrix may be different from
* the matrix located at {@link #_hdfsFileName}; false if the two
* matrices are supposed to be the same.
- *
+ *
* @return true if dirty
*/
public boolean isDirty() {
@@ -337,7 +337,7 @@ public FileFormatProperties getFileFormatProperties() {
public void setFileFormatProperties(FileFormatProperties props) {
_formatProps = props;
}
-
+
@Override
public void setMetaData(MetaData md) {
_metaData = md;
@@ -351,7 +351,7 @@ public void setCompressedSize(long size){
public boolean isCompressed(){
return _compressed;
}
-
+
public long getCompressedSize(){
return _compressedSize;
}
@@ -365,7 +365,7 @@ public MetaData getMetaData() {
public void removeMetaData() {
_metaData = null;
}
-
+
public DataCharacteristics getDataCharacteristics() {
return _metaData.getDataCharacteristics();
}
@@ -381,11 +381,11 @@ public long getNumRows() {
public long getNumColumns() {
return getDataCharacteristics().getCols();
}
-
+
public int getBlocksize() {
return getDataCharacteristics().getBlocksize();
}
-
+
public abstract void refreshMetaData();
public LineageItem getCacheLineage() {
@@ -419,15 +419,15 @@ public boolean isFederated() {
}
return _fedMapping != null;
}
-
+
public boolean isFederated(FType type) {
return isFederated() && (type == null || _fedMapping.getType().isType(type));
}
-
+
public boolean isFederatedExcept(FType type) {
return isFederated() && !isFederated(type);
}
-
+
/**
* Gets the mapping of indices ranges to federated objects.
* @return fedMapping mapping
@@ -435,7 +435,7 @@ public boolean isFederatedExcept(FType type) {
public FederationMap getFedMapping() {
return _fedMapping;
}
-
+
/**
* Sets the mapping of indices ranges to federated objects.
* @param fedMapping mapping
@@ -443,7 +443,7 @@ public FederationMap getFedMapping() {
public void setFedMapping(FederationMap fedMapping) {
_fedMapping = fedMapping;
}
-
+
public RDDObject getRDDHandle() {
return _rddHandle;
}
@@ -452,7 +452,7 @@ public void setRDDHandle( RDDObject rdd ) {
//cleanup potential old back reference
if( _rddHandle != null )
_rddHandle.setBackReference(null);
-
+
//add new rdd handle
_rddHandle = rdd;
if( _rddHandle != null )
@@ -462,7 +462,7 @@ public void setRDDHandle( RDDObject rdd ) {
public boolean hasRDDHandle() {
return _rddHandle != null && _rddHandle.hasBackReference();
}
-
+
public BroadcastObject getBroadcastHandle() {
return _bcHandle;
}
@@ -470,44 +470,17 @@ public BroadcastObject getBroadcastHandle() {
public boolean hasBroadcastHandle() {
return _bcHandle != null && _bcHandle.hasBackReference();
}
-
- public OOCStream getStreamHandle() {
- if( !hasStreamHandle() ) {
- final SubscribableTaskQueue _mStream = new SubscribableTaskQueue<>();
- _streamHandle = _mStream;
- DataCharacteristics dc = getDataCharacteristics();
- MatrixBlock src = (MatrixBlock)acquireReadAndRelease();
- LongStream.range(0, dc.getNumBlocks())
- .mapToObj(i -> UtilFunctions.createIndexedMatrixBlock(src, dc, i))
- .forEach( blk -> {
- try{
- _mStream.enqueue(blk);
- }
- catch(Exception ex) {
- throw ex instanceof DMLRuntimeException ? (DMLRuntimeException) ex : new DMLRuntimeException(ex);
- }});
- _mStream.closeInput();
- }
-
- return _streamHandle.getReadStream();
+
+ public LocalTaskQueue getStreamHandle() {
+ return _streamHandle;
}
-
- /**
- * Probes if stream handle is existing, because getStreamHandle
- * creates a new stream if not existing.
- *
- * @return true if existing, false otherwise
- */
- public boolean hasStreamHandle() {
- return _streamHandle != null && !_streamHandle.isProcessed();
- }
@SuppressWarnings({ "rawtypes", "unchecked" })
public void setBroadcastHandle( BroadcastObject bc ) {
//cleanup potential old back reference
if( _bcHandle != null )
_bcHandle.setBackReference(null);
-
+
//add new broadcast handle
_bcHandle = bc;
if( _bcHandle != null )
@@ -527,15 +500,15 @@ public synchronized void setGPUObject(GPUContext gCtx, GPUObject gObj) {
if (old != null)
throw new DMLRuntimeException("GPU : Inconsistent internal state - this CacheableData already has a GPUObject assigned to the current GPUContext (" + gCtx + ")");
}
-
+
public synchronized void removeGPUObject(GPUContext gCtx) {
_gpuObjects.remove(gCtx);
}
- public synchronized void setStreamHandle(OOCStreamable q) {
+ public synchronized void setStreamHandle(LocalTaskQueue q) {
_streamHandle = q;
}
-
+
// *********************************************
// *** ***
// *** HIGH-LEVEL METHODS THAT SPECIFY ***
@@ -548,38 +521,38 @@ public T acquireReadAndRelease() {
release();
return tmp;
}
-
+
/**
* Acquires a shared "read-only" lock, produces the reference to the cache block,
* restores the cache block to main memory, reads from HDFS if needed.
- *
+ *
* Synchronized because there might be parallel threads (parfor local) that
* access the same object (in case it was created before the loop).
- *
+ *
* In-Status: EMPTY, EVICTABLE, EVICTED, READ;
* Out-Status: READ(+1).
- *
+ *
* @return cacheable data
*/
public T acquireRead() {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
-
+
//core internal acquire (synchronized per object)
T ret = acquireReadIntern();
-
+
//update thread-local status (after pin but outside the
//critical section of accessing a shared object)
if( !isBelowCachingThreshold() )
updateStatusPinned(true);
-
+
if( DMLScript.STATISTICS ){
long t1 = System.nanoTime();
CacheStatistics.incrementAcquireRTime(t1-t0);
}
-
+
return ret;
}
-
+
private synchronized T acquireReadIntern() {
if ( !isAvailableToRead() )
throw new DMLRuntimeException("MatrixObject not available to read.");
@@ -591,7 +564,7 @@ private synchronized T acquireReadIntern() {
if (OptimizerUtils.isUMMEnabled())
//track and make space in the UMM
UnifiedMemoryManager.pin(this);
-
+
//call acquireHostRead if gpuHandle is set as well as is allocated
if( DMLScript.USE_ACCELERATOR && _gpuObjects != null ) {
boolean copiedFromGPU = false;
@@ -606,7 +579,7 @@ else if (gObj != null) {
}
}
}
-
+
//read data from HDFS/RDD if required
//(probe data for cache_nowrite / jvm_reuse)
if( _data==null && ( isEmpty(true) || hasValidLineage() )) {
@@ -625,20 +598,20 @@ && getRDDHandle() == null) ) {
//mark for initial local write despite read operation
_requiresLocalWrite = false;
}
- else if( hasStreamHandle() ) {
- _data = readBlobFromStream( getStreamHandle().toLocalTaskQueue() );
+ else if( getStreamHandle() != null ) {
+ _data = readBlobFromStream( getStreamHandle() );
}
else if( getRDDHandle()==null || getRDDHandle().allowsShortCircuitRead() ) {
if( DMLScript.STATISTICS )
CacheStatistics.incrementHDFSHits();
-
+
//check filename
if( _hdfsFileName == null )
throw new DMLRuntimeException("Cannot read matrix for empty filename.");
-
+
//read cacheable data from hdfs
_data = readBlobFromHDFS( _hdfsFileName );
-
+
//mark for initial local write despite read operation
_requiresLocalWrite = false;
}
@@ -646,11 +619,11 @@ else if( getRDDHandle()==null || getRDDHandle().allowsShortCircuitRead() ) {
//read matrix from rdd (incl execute pending rdd operations)
MutableBoolean writeStatus = new MutableBoolean();
_data = readBlobFromRDD( getRDDHandle(), writeStatus );
-
+
//mark for initial local write (prevent repeated execution of rdd operations)
_requiresLocalWrite = !writeStatus.booleanValue();
}
-
+
setDirty(false);
}
catch (IOException e) {
@@ -667,7 +640,7 @@ else if( _data!=null && DMLScript.STATISTICS ) {
return _data;
}
-
+
/**
* Acquires the exclusive "write" lock for a thread that wants to throw away the
* old cache block data and link up with new cache block data. Abandons the old data
@@ -675,93 +648,93 @@ else if( _data!=null && DMLScript.STATISTICS ) {
* In-Status: EMPTY, EVICTABLE, EVICTED;
* Out-Status: MODIFY.
- *
+ *
* @param newData new data
* @return cacheable data
*/
public T acquireModify(T newData) {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
-
+
//core internal acquire (synchronized per object)
T ret = acquireModifyIntern(newData);
-
+
//update thread-local status (after pin but outside the
//critical section of accessing a shared object)
if( !isBelowCachingThreshold() )
updateStatusPinned(true);
-
+
if( DMLScript.STATISTICS ){
long t1 = System.nanoTime();
CacheStatistics.incrementAcquireMTime(t1-t0);
if (DMLScript.JMLC_MEM_STATISTICS)
Statistics.addCPMemObject(System.identityHashCode(this), getDataSize());
}
-
+
if(newData instanceof CompressedMatrixBlock) {
setCompressedSize(newData.getInMemorySize());
}
return ret;
}
-
+
private synchronized T acquireModifyIntern(T newData) {
if (! isAvailableToModify ())
throw new DMLRuntimeException("CacheableData not available to modify.");
-
+
//clear old data
clearData();
-
+
//cache status maintenance
acquire (true, false); //no need to load evicted matrix
-
+
setDirty(true);
_isAcquireFromEmpty = false;
-
+
//set references to new data
if (newData == null)
throw new DMLRuntimeException("acquireModify with empty cache block.");
return _data = newData;
}
-
+
/**
* Releases the shared ("read-only") or exclusive ("write") lock. Updates
* size information, last-access time, metadata, etc.
- *
+ *
* Synchronized because there might be parallel threads (parfor local) that
* access the same object (in case it was created before the loop).
- *
+ *
* In-Status: READ, MODIFY;
* Out-Status: READ(-1), EVICTABLE, EMPTY.
- *
+ *
*/
public void release() {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
-
+
//update thread-local status (before unpin but outside
//the critical section of accessing a shared object)
if( !isBelowCachingThreshold() )
updateStatusPinned(false);
-
+
//core internal release (synchronized per object)
releaseIntern();
-
+
if( DMLScript.STATISTICS ){
long t1 = System.nanoTime();
CacheStatistics.incrementReleaseTime(t1-t0);
}
}
-
+
private synchronized void releaseIntern() {
boolean write = false;
if ( isModify() ) {
//set flags for write
write = true;
setDirty(true);
-
+
//update meta data
refreshMetaData();
-
- //compact empty in-memory block
+
+ //compact empty in-memory block
_data.compactEmptyBlock();
}
@@ -771,7 +744,7 @@ private synchronized void releaseIntern() {
//cache status maintenance (pass cacheNoWrite flag)
release(_isAcquireFromEmpty && !_requiresLocalWrite);
-
+
if( isCachingActive() //only if caching is enabled (otherwise keep everything in mem)
&& isCached(true) //not empty and not read/modify
&& !isBelowCachingThreshold() ) //min size for caching
@@ -793,39 +766,39 @@ && isCached(true) //not empty and not read/modify
if( DMLScript.STATISTICS && write && hasValidLineage() )
CacheStatistics.incrementLinWrites();
-
+
//create cache
createCache();
_data = null;
}
}
-
+
public void clearData() {
clearData(-1);
}
-
+
/**
* Sets the cache block reference to null, abandons the old block.
* Makes the "envelope" empty. Run it to finalize the object (otherwise the
* evicted cache block file may remain undeleted).
- *
+ *
* In-Status: EMPTY, EVICTABLE, EVICTED;
* Out-Status: EMPTY.
- *
+ *
* @param tid thread ID
- *
+ *
*/
- public synchronized void clearData(long tid)
+ public synchronized void clearData(long tid)
{
- // check if cleanup enabled and possible
- if( !isCleanupEnabled() )
+ // check if cleanup enabled and possible
+ if( !isCleanupEnabled() )
return; // do nothing
if( !isAvailableToModify() )
throw new DMLRuntimeException("CacheableData (" + getDebugName() + ") not available to "
+ "modify. Status = " + _cacheStatus.name() + ".");
-
+
// clear existing WB / FS representation (but prevent unnecessary probes)
- if( !(isEmpty(true)||(_data!=null && isBelowCachingThreshold())
+ if( !(isEmpty(true)||(_data!=null && isBelowCachingThreshold())
||(_data!=null && !isCachingActive()) )) //additional condition for JMLC
freeEvictedBlob();
@@ -833,7 +806,7 @@ public synchronized void clearData(long tid)
_data = null;
clearCache();
setCacheLineage(null);
-
+
// clear rdd/broadcast back refs
if( _rddHandle != null )
_rddHandle.setBackReference(null);
@@ -845,11 +818,11 @@ public synchronized void clearData(long tid)
gObj.clearData(null, DMLScript.EAGER_CUDA_FREE);
}
}
-
+
//clear federated matrix
if( _fedMapping != null )
_fedMapping.execCleanup(tid, _fedMapping.getID());
-
+
// change object state EMPTY
setDirty(false);
setEmpty();
@@ -858,13 +831,13 @@ public synchronized void clearData(long tid)
public synchronized void exportData() {
exportData( -1 );
}
-
+
/**
* Writes, or flushes, the cache block data to HDFS.
- *
+ *
* In-Status: EMPTY, EVICTABLE, EVICTED, READ;
* Out-Status: EMPTY, EVICTABLE, EVICTED, READ.
- *
+ *
* @param replication ?
*/
public synchronized void exportData( int replication ) {
@@ -878,18 +851,18 @@ public synchronized void exportData(String fName, String outputFormat) {
public synchronized void exportData(String fName, String outputFormat, FileFormatProperties formatProperties) {
exportData(fName, outputFormat, -1, formatProperties);
}
-
+
/**
* Synchronized because there might be parallel threads (parfor local) that
* access the same object (in case it was created before the loop).
* If all threads export the same data object concurrently it results in errors
* because they all write to the same file. Efficiency for loops and parallel threads
* is achieved by checking if the in-memory block is dirty.
- *
+ *
* NOTE: MB: we do not use dfs copy from local (evicted) to HDFS because this would ignore
* the output format and most importantly would bypass reblocking during write (which effects the
- * potential degree of parallelism). However, we copy files on HDFS if certain criteria are given.
- *
+ * potential degree of parallelism). However, we copy files on HDFS if certain criteria are given.
+ *
* @param fName file name
* @param outputFormat format
* @param replication ?
@@ -905,7 +878,7 @@ public synchronized void exportData (String fName, String outputFormat, int repl
if( LOG.isTraceEnabled() )
LOG.trace("Exporting " + this.getDebugName() + " to " + fName + " in format " + outputFormat);
-
+
if( DMLScript.USE_ACCELERATOR && _gpuObjects != null ) {
boolean copiedFromGPU = false;
for (Map.Entry kv : _gpuObjects.entrySet()) {
@@ -919,12 +892,12 @@ public synchronized void exportData (String fName, String outputFormat, int repl
}
}
}
-
+
//check for persistent or transient writes
boolean pWrite = !fName.equals(_hdfsFileName);
if( !pWrite )
setHDFSFileExists(true);
-
+
//check for common file scheme (otherwise no copy/rename)
int blen = (formatProperties == null) ?
ConfigurationManager.getBlocksize() : formatProperties.getBlocksize();
@@ -933,7 +906,7 @@ public synchronized void exportData (String fName, String outputFormat, int repl
boolean eqFormat = isEqualOutputFormat(outputFormat);
boolean eqBlksize = (getBlocksize() != blen)
&& (outputFormat == null || outputFormat.equals("binary"));
-
+
//actual export (note: no direct transfer of local copy in order to ensure blocking (and hence, parallelism))
if( isDirty() || !eqScheme || isFederated() ||
(pWrite && (!eqFormat | !eqBlksize)) )
@@ -957,7 +930,7 @@ public synchronized void exportData (String fName, String outputFormat, int repl
if( isEmpty(true) && !federatedWrite)
{
//read data from HDFS if required (never read before), this applies only to pWrite w/ different output formats
- //note: for large rdd outputs, we compile dedicated writespinstructions (no need to handle this here)
+ //note: for large rdd outputs, we compile dedicated writespinstructions (no need to handle this here)
try {
if( getRDDHandle()==null || getRDDHandle().allowsShortCircuitRead() )
_data = readBlobFromHDFS( _hdfsFileName );
@@ -972,15 +945,15 @@ else if(!federatedWrite)
throw new DMLRuntimeException("Reading of " + _hdfsFileName + " ("+hashCode()+") failed.", e);
}
}
-
+
//get object from cache
if(!federatedWrite) {
if( _data == null )
getCache();
acquire( false, _data==null ); //incl. read matrix if evicted
}
-
- // b) write the matrix
+
+ // b) write the matrix
try {
writeMetaData( fName, outputFormat, formatProperties );
writeBlobToHDFS( fName, outputFormat, replication, formatProperties );
@@ -1014,7 +987,7 @@ else if( pWrite ) // pwrite with same output format
}
}
else if( getRDDHandle()!=null && getRDDHandle().isPending()
- && !getRDDHandle().isHDFSFile()
+ && !getRDDHandle().isHDFSFile()
&& !getRDDHandle().allowsShortCircuitRead() )
{
//CASE 3: pending rdd operation (other than checkpoints)
@@ -1031,25 +1004,25 @@ else if( getRDDHandle()!=null && getRDDHandle().isPending()
throw new DMLRuntimeException("Export to " + fName + " failed.", e);
}
}
- else
+ else
{
//CASE 4: data already in hdfs (do nothing, no need for export)
if( LOG.isTraceEnabled() )
LOG.trace(this.getDebugName() + ": Skip export to hdfs since data already exists.");
}
-
+
_hdfsFileExists = true;
if( DMLScript.STATISTICS ){
long t1 = System.nanoTime();
CacheStatistics.incrementExportTime(t1-t0);
}
}
-
+
// --------- ABSTRACT LOW-LEVEL CACHE I/O OPERATIONS ----------
/**
* Checks if the data blob reference points to some in-memory object.
- * This method is called when releasing the (last) lock. Do not call
+ * This method is called when releasing the (last) lock. Do not call
* this method for a blob that has been evicted.
*
* @return true if the blob is in main memory and the
@@ -1068,11 +1041,11 @@ protected boolean isBlobPresent() {
protected void restoreBlobIntoMemory() {
String cacheFilePathAndName = getCacheFilePathAndName();
long begin = LOG.isTraceEnabled() ? System.currentTimeMillis() : 0;
-
+
if( LOG.isTraceEnabled() )
- LOG.trace ("CACHE: Restoring matrix... " + hashCode() + " HDFS path: " +
+ LOG.trace ("CACHE: Restoring matrix... " + hashCode() + " HDFS path: " +
(_hdfsFileName == null ? "null" : _hdfsFileName) + ", Restore from path: " + cacheFilePathAndName);
-
+
if (_data != null)
throw new DMLRuntimeException(cacheFilePathAndName + " : Cannot restore on top of existing in-memory data.");
@@ -1080,20 +1053,20 @@ protected void restoreBlobIntoMemory() {
_data = readBlobFromCache(cacheFilePathAndName);
}
catch (IOException e) {
- throw new DMLRuntimeException(cacheFilePathAndName + " : Restore failed.", e);
+ throw new DMLRuntimeException(cacheFilePathAndName + " : Restore failed.", e);
}
-
+
//check for success
if (_data == null)
throw new DMLRuntimeException (cacheFilePathAndName + " : Restore failed.");
-
+
if( LOG.isTraceEnabled() )
LOG.trace("Restoring matrix - COMPLETED ... " + (System.currentTimeMillis()-begin) + " msec.");
}
protected abstract T readBlobFromCache(String fname)
throws IOException;
-
+
/**
* Low-level cache I/O method that deletes the file containing the
* evicted data blob, without reading it.
@@ -1103,16 +1076,16 @@ public final void freeEvictedBlob() {
String cacheFilePathAndName = getCacheFilePathAndName();
long begin = LOG.isTraceEnabled() ? System.currentTimeMillis() : 0;
if( LOG.isTraceEnabled() )
- LOG.trace("CACHE: Freeing evicted matrix... " + hashCode() + " HDFS path: " +
+ LOG.trace("CACHE: Freeing evicted matrix... " + hashCode() + " HDFS path: " +
(_hdfsFileName == null ? "null" : _hdfsFileName) + " Eviction path: " + cacheFilePathAndName);
-
+
if(isCachingActive()) {
if (OptimizerUtils.isUMMEnabled())
UnifiedMemoryManager.deleteBlock(cacheFilePathAndName);
else
LazyWriteBuffer.deleteBlock(cacheFilePathAndName);
}
-
+
if( LOG.isTraceEnabled() )
LOG.trace("Freeing evicted matrix - COMPLETED ... " + (System.currentTimeMillis()-begin) + " msec.");
}
@@ -1120,7 +1093,7 @@ public final void freeEvictedBlob() {
protected boolean isBelowCachingThreshold() {
return (_data.getInMemorySize() <= CACHING_THRESHOLD);
}
-
+
public static boolean isBelowCachingThreshold(CacheBlock> data) {
boolean ret;
if (OptimizerUtils.isUMMEnabled())
@@ -1129,11 +1102,11 @@ public static boolean isBelowCachingThreshold(CacheBlock> data) {
ret = LazyWriteBuffer.getCacheBlockSize(data) <= CACHING_THRESHOLD;
return ret;
}
-
+
public long getDataSize() {
return (_data != null) ?_data.getInMemorySize() : 0;
}
-
+
protected ValueType[] getSchema() {
return null;
}
@@ -1141,8 +1114,8 @@ protected ValueType[] getSchema() {
@Override //Data
public synchronized String getDebugName() {
int maxLength = 23;
- String debugNameEnding = (_hdfsFileName == null ? "null" :
- (_hdfsFileName.length() < maxLength ? _hdfsFileName : "..." +
+ String debugNameEnding = (_hdfsFileName == null ? "null" :
+ (_hdfsFileName.length() < maxLength ? _hdfsFileName : "..." +
_hdfsFileName.substring (_hdfsFileName.length() - maxLength + 3)));
return hashCode() + " " + debugNameEnding;
}
@@ -1172,7 +1145,7 @@ protected T readBlobFromFederated(FederationMap fedMap) throws IOException {
DataCharacteristics dc = iimd.getDataCharacteristics();
return readBlobFromFederated(fedMap, dc.getDims());
}
-
+
protected abstract T readBlobFromFederated(FederationMap fedMap, long[] dims)
throws IOException;
@@ -1181,22 +1154,22 @@ protected abstract void writeBlobToHDFS(String fname, String ofmt, int rep, File
protected abstract long writeStreamToHDFS(String fname, String ofmt, int rep, FileFormatProperties fprop)
throws IOException;
-
+
protected abstract void writeBlobFromRDDtoHDFS(RDDObject rdd, String fname, String ofmt)
throws IOException;
protected abstract T reconstructByLineage(LineageItem li)
throws IOException;
-
+
protected void writeMetaData (String filePathAndName, String outputFormat, FileFormatProperties formatProperties)
throws IOException
- {
+ {
MetaDataFormat iimd = (MetaDataFormat) _metaData;
-
+
if (iimd == null)
throw new DMLRuntimeException("Unexpected error while writing mtd file (" + filePathAndName + ") -- metadata is null.");
-
+
// Write the matrix to HDFS in requested format
FileFormat fmt = (outputFormat != null) ? FileFormat.safeValueOf(outputFormat) : iimd.getFileFormat();
if ( fmt != FileFormat.MM ) {
@@ -1204,15 +1177,15 @@ protected void writeMetaData (String filePathAndName, String outputFormat, FileF
DataCharacteristics dc = iimd.getDataCharacteristics();
if( formatProperties != null && formatProperties.knownBlocksize() )
dc.setBlocksize(formatProperties.getBlocksize());
-
+
// when outputFormat is binaryblock, make sure that matrixCharacteristics has correct blocking dimensions
- // note: this is only required if singlenode (due to binarycell default)
+ // note: this is only required if singlenode (due to binarycell default)
if ( fmt == FileFormat.BINARY && DMLScript.getGlobalExecMode() == ExecMode.SINGLE_NODE
&& dc.getBlocksize() != ConfigurationManager.getBlocksize() )
{
dc = new MatrixCharacteristics(dc.getRows(), dc.getCols(), dc.getBlocksize(), dc.getNonZeros());
}
-
+
//write the actual meta data file
HDFSTool.writeMetaDataFile (filePathAndName + ".mtd",
valueType, getSchema(), dataType, dc, fmt, formatProperties);
@@ -1226,9 +1199,9 @@ protected boolean isEqualOutputFormat(String outputFormat) {
}
return true;
}
-
+
// ------------- IMPLEMENTED CACHE LOGIC METHODS --------------
-
+
protected String getCacheFilePathAndName () {
if( _cacheFileName==null ) {
StringBuilder sb = new StringBuilder();
@@ -1238,15 +1211,15 @@ protected String getCacheFilePathAndName () {
sb.append(CacheableData.CACHING_EVICTION_FILEEXTENSION);
_cacheFileName = sb.toString();
}
-
+
return _cacheFileName;
}
-
+
/**
* This method "acquires the lock" to ensure that the data blob is in main memory
* (not evicted) while it is being accessed. When called, the method will try to
* restore the blob if it has been evicted. There are two kinds of locks it may
- * acquire: a shared "read" lock (if the argument is false) or the
+ * acquire: a shared "read" lock (if the argument is false) or the
* exclusive "modify" lock (if the argument is true).
* The method can fail in three ways:
* (1) if there is lock status conflict;
@@ -1256,9 +1229,9 @@ protected String getCacheFilePathAndName () {
* its last-access timestamp. For the shared "read" lock, acquiring a new lock
* increments the associated count. The "read" count has to be decremented once
* the blob is no longer used, which may re-enable eviction. This method has to
- * be called only once per matrix operation and coupled with {@link #release()},
+ * be called only once per matrix operation and coupled with {@link #release()},
* because it increments the lock count and the other method decrements this count.
- *
+ *
* @param isModify : true for the exclusive "modify" lock,
* false for a shared "read" lock.
* @param restore true if restore
@@ -1290,7 +1263,7 @@ protected void acquire (boolean isModify, boolean restore) {
LOG.trace("Acquired lock on " + getDebugName() + ", status: " + _cacheStatus.name() );
}
-
+
/**
* Call this method to permit eviction for the stored data blob, or to
* decrement its "read" count if it is "read"-locked by other threads.
@@ -1300,7 +1273,7 @@ protected void acquire (boolean isModify, boolean restore) {
* called only once per process and coupled with {@link #acquire(boolean, boolean)},
* because it decrements the lock count and the other method increments
* the lock count.
- *
+ *
* @param cacheNoWrite ?
*/
protected void release(boolean cacheNoWrite)
@@ -1321,37 +1294,37 @@ protected void release(boolean cacheNoWrite)
setEmpty();
break;
}
-
+
if( LOG.isTraceEnabled() )
LOG.trace("Released lock on " + getDebugName() + ", status: " + _cacheStatus.name());
-
+
}
-
+
// **************************************************
// *** ***
// *** CACHE STATUS FIELD - CLASSES AND METHODS ***
// *** ***
// **************************************************
-
+
public boolean isCached(boolean inclCachedNoWrite) {
return _cacheStatus == CacheStatus.CACHED
|| (inclCachedNoWrite && _cacheStatus == CacheStatus.CACHED_NOWRITE);
}
-
+
public void setEmptyStatus() {
setEmpty();
}
-
+
protected boolean isEmpty(boolean inclCachedNoWrite) {
return _cacheStatus == CacheStatus.EMPTY
|| (inclCachedNoWrite && _cacheStatus == CacheStatus.CACHED_NOWRITE);
}
-
+
protected boolean isModify() {
return (_cacheStatus == CacheStatus.MODIFY);
}
-
+
public boolean isPendingRDDOps() {
return isEmpty(true) && _data == null && (_rddHandle != null && _rddHandle.hasBackReference());
}
@@ -1364,11 +1337,11 @@ public boolean isDeviceToHostCopy() {
protected void setEmpty() {
_cacheStatus = CacheStatus.EMPTY;
}
-
+
protected void setModify() {
_cacheStatus = CacheStatus.MODIFY;
}
-
+
protected void setCached() {
_cacheStatus = CacheStatus.CACHED;
}
@@ -1377,25 +1350,25 @@ protected void addOneRead() {
_numReadThreads ++;
_cacheStatus = CacheStatus.READ;
}
-
+
protected void removeOneRead(boolean doesBlobExist, boolean cacheNoWrite) {
_numReadThreads --;
if (_numReadThreads == 0) {
if( cacheNoWrite )
- _cacheStatus = (doesBlobExist ?
+ _cacheStatus = (doesBlobExist ?
CacheStatus.CACHED_NOWRITE : CacheStatus.EMPTY);
else
- _cacheStatus = (doesBlobExist ?
+ _cacheStatus = (doesBlobExist ?
CacheStatus.CACHED : CacheStatus.EMPTY);
}
}
-
+
protected boolean isAvailableToRead() {
return (_cacheStatus != CacheStatus.MODIFY);
}
-
+
protected boolean isAvailableToModify() {
- return ( _cacheStatus == CacheStatus.EMPTY
+ return ( _cacheStatus == CacheStatus.EMPTY
|| _cacheStatus == CacheStatus.CACHED
|| _cacheStatus == CacheStatus.CACHED_NOWRITE);
}
@@ -1406,10 +1379,10 @@ protected boolean isAvailableToModify() {
// *** FOR SOFTREFERENCE CACHE ***
// *** ***
// *******************************************
-
+
/**
* Creates a new cache soft reference to the currently
- * referenced cache block.
+ * referenced cache block.
*/
protected void createCache( ) {
if( _cache == null || _cache.get() == null )
@@ -1425,7 +1398,7 @@ protected void getCache() {
_data = _cache.get();
}
}
-
+
/** Clears the cache soft reference if existing. */
protected void clearCache() {
if( _cache != null ) {
@@ -1445,39 +1418,39 @@ protected void updateStatusPinned(boolean add) {
protected static long getPinnedSize() {
return sizePinned.get();
}
-
+
public static void addBroadcastSize(long size) {
_refBCs.addAndGet(size);
}
-
+
public static long getBroadcastSize() {
//scale the total sum of all broadcasts by the current fraction
//of local memory to equally distribute it across parfor workers
return (long) (_refBCs.longValue() *
InfrastructureAnalyzer.getLocalMaxMemoryFraction());
}
-
+
// --------- STATIC CACHE INIT/CLEANUP OPERATIONS ----------
public synchronized static void cleanupCacheDir() {
//cleanup remaining cached writes
LazyWriteBuffer.cleanup();
UnifiedMemoryManager.cleanup();
-
+
//delete cache dir and files
cleanupCacheDir(true);
}
-
+
/**
* Deletes the DML-script-specific caching working dir.
- *
+ *
* @param withDir if true, delete directory
*/
public synchronized static void cleanupCacheDir(boolean withDir)
{
//get directory name
String dir = cacheEvictionLocalFilePath;
-
+
//clean files with cache prefix
if( dir != null ) //if previous init cache
{
@@ -1491,30 +1464,30 @@ public synchronized static void cleanupCacheDir(boolean withDir)
fdir.delete(); //deletes dir only if empty
}
}
-
+
_activeFlag = false;
}
-
+
/**
* Inits caching with the default uuid of DMLScript
- *
+ *
* @throws IOException if IOException occurs
*/
- public synchronized static void initCaching()
+ public synchronized static void initCaching()
throws IOException
{
initCaching(DMLScript.getUUID());
}
-
+
/**
* Creates the DML-script-specific caching working dir.
- *
+ *
* Takes the UUID in order to allow for custom uuid, e.g., for remote parfor caching
- *
+ *
* @param uuid ID
* @throws IOException if IOException occurs
*/
- public synchronized static void initCaching( String uuid )
+ public synchronized static void initCaching( String uuid )
throws IOException
{
try
@@ -1527,7 +1500,7 @@ public synchronized static void initCaching( String uuid )
{
throw new IOException(e);
}
-
+
if (OptimizerUtils.isUMMEnabled())
//init unified memory manager
UnifiedMemoryManager.init();
@@ -1542,26 +1515,26 @@ public synchronized static void initCaching( String uuid )
public static boolean isCachingActive() {
return _activeFlag;
}
-
+
public static void disableCaching() {
_activeFlag = false;
}
-
+
public static void enableCaching() {
_activeFlag = true;
}
public synchronized boolean moveData(String fName, String outputFormat) {
boolean ret = false;
-
+
try
{
//check for common file scheme (otherwise no copy/rename)
boolean eqScheme = IOUtilFunctions.isSameFileScheme(
new Path(_hdfsFileName), new Path(fName));
-
+
//export or rename to target file on hdfs
- if( isDirty() || !eqScheme || (!isEqualOutputFormat(outputFormat) && isEmpty(true))
+ if( isDirty() || !eqScheme || (!isEqualOutputFormat(outputFormat) && isEmpty(true))
|| (getRDDHandle()!=null && !HDFSTool.existsFileOnHDFS(_hdfsFileName)) )
{
exportData(fName, outputFormat);
@@ -1579,7 +1552,7 @@ else if( isEqualOutputFormat(outputFormat) )
catch (Exception e) {
throw new DMLRuntimeException("Move to " + fName + " failed.", e);
}
-
+
return ret;
}
@@ -1587,7 +1560,7 @@ else if( isEqualOutputFormat(outputFormat) )
public String toString() {
return toString(false);
}
-
+
@Override
public String toString(boolean metaOnly) {
StringBuilder str = new StringBuilder();
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/CPInstructionParser.java b/src/main/java/org/apache/sysds/runtime/instructions/CPInstructionParser.java
index 92e11b425dd..b44e06ad2d0 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/CPInstructionParser.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/CPInstructionParser.java
@@ -82,92 +82,92 @@ public static CPInstruction parseSingleInstruction (String str ) {
throw new DMLRuntimeException("Unable to parse instruction: " + str);
return cpinst;
}
-
+
public static CPInstruction parseSingleInstruction ( InstructionType cptype, String str ) {
ExecType execType;
- if ( str == null || str.isEmpty() )
+ if ( str == null || str.isEmpty() )
return null;
switch(cptype) {
case AggregateUnary:
return AggregateUnaryCPInstruction.parseInstruction(str);
-
+
case AggregateBinary:
return AggregateBinaryCPInstruction.parseInstruction(str);
-
+
case AggregateTernary:
return AggregateTernaryCPInstruction.parseInstruction(str);
-
+
case Unary:
return UnaryCPInstruction.parseInstruction(str);
case Binary:
return BinaryCPInstruction.parseInstruction(str);
-
+
case Ternary:
return TernaryCPInstruction.parseInstruction(str);
-
+
case Quaternary:
return QuaternaryCPInstruction.parseInstruction(str);
-
+
case BuiltinNary:
return BuiltinNaryCPInstruction.parseInstruction(str);
-
+
case Ctable:
return CtableCPInstruction.parseInstruction(str);
-
+
case Reorg:
return ReorgCPInstruction.parseInstruction(str);
-
+
case Dnn:
return DnnCPInstruction.parseInstruction(str);
-
+
case UaggOuterChain:
return UaggOuterChainCPInstruction.parseInstruction(str);
-
+
case Reshape:
return ReshapeCPInstruction.parseInstruction(str);
-
+
case Append:
return AppendCPInstruction.parseInstruction(str);
-
+
case Variable:
return VariableCPInstruction.parseInstruction(str);
-
+
case Rand:
return DataGenCPInstruction.parseInstruction(str);
case StringInit:
return StringInitCPInstruction.parseInstruction(str);
-
+
case FCall:
return FunctionCallCPInstruction.parseInstruction(str);
case ParameterizedBuiltin:
return ParameterizedBuiltinCPInstruction.parseInstruction(str);
-
+
case MultiReturnParameterizedBuiltin:
return MultiReturnParameterizedBuiltinCPInstruction.parseInstruction(str);
-
+
case MultiReturnComplexMatrixBuiltin:
return MultiReturnComplexMatrixBuiltinCPInstruction.parseInstruction(str);
-
+
case MultiReturnBuiltin:
return MultiReturnBuiltinCPInstruction.parseInstruction(str);
-
+
case QSort:
return QuantileSortCPInstruction.parseInstruction(str);
-
+
case QPick:
return QuantilePickCPInstruction.parseInstruction(str);
-
+
case MatrixIndexing:
- execType = ExecType.valueOf( str.split(Instruction.OPERAND_DELIM)[0] );
+ execType = ExecType.valueOf( str.split(Instruction.OPERAND_DELIM)[0] );
if( execType == ExecType.CP )
return IndexingCPInstruction.parseInstruction(str);
else //exectype CP_FILE
return MatrixIndexingCPFileInstruction.parseInstruction(str);
-
- case Builtin:
+
+ case Builtin:
String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
if(parts[0].equals(Opcodes.LOG.toString()) || parts[0].equals(Opcodes.LOGNZ.toString())) {
if(InstructionUtils.isInteger(parts[3])) // B=log(A), y=log(x)
@@ -177,44 +177,44 @@ public static CPInstruction parseSingleInstruction ( InstructionType cptype, Str
return BinaryCPInstruction.parseInstruction(str);
}
throw new DMLRuntimeException("Invalid Builtin Instruction: " + str );
-
+
case MMTSJ:
return MMTSJCPInstruction.parseInstruction(str);
-
+
case PMMJ:
return PMMJCPInstruction.parseInstruction(str);
-
+
case MMChain:
return MMChainCPInstruction.parseInstruction(str);
-
+
case CentralMoment:
return CentralMomentCPInstruction.parseInstruction(str);
-
+
case Covariance:
return CovarianceCPInstruction.parseInstruction(str);
case Compression:
return CompressionCPInstruction.parseInstruction(str);
-
+
case DeCompression:
return DeCompressionCPInstruction.parseInstruction(str);
-
+
case QuantizeCompression:
LOG.debug("Parsing Quantize Compress instruction");
- return CompressionCPInstruction.parseQuantizationFusedInstruction(str);
+ return CompressionCPInstruction.parseQuantizationFusedInstruction(str);
case Local:
return LocalCPInstruction.parseInstruction(str);
case SpoofFused:
return SpoofCPInstruction.parseInstruction(str);
-
+
case Sql:
return SqlCPInstruction.parseInstruction(str);
-
+
case Prefetch:
return PrefetchCPInstruction.parseInstruction(str);
-
+
case Broadcast:
return BroadcastCPInstruction.parseInstruction(str);
@@ -223,10 +223,10 @@ public static CPInstruction parseSingleInstruction ( InstructionType cptype, Str
case Union:
return UnionCPInstruction.parseInstruction(str);
-
+
case EINSUM:
return EinsumCPInstruction.parseInstruction(str);
-
+
default:
throw new DMLRuntimeException("Invalid CP Instruction Type: " + cptype );
}
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/OOCInstructionParser.java b/src/main/java/org/apache/sysds/runtime/instructions/OOCInstructionParser.java
index f23ad6d67a6..8b64073111c 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/OOCInstructionParser.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/OOCInstructionParser.java
@@ -38,6 +38,8 @@
import org.apache.sysds.runtime.instructions.ooc.MatrixVectorBinaryOOCInstruction;
import org.apache.sysds.runtime.instructions.ooc.TransposeOOCInstruction;
import org.apache.sysds.runtime.instructions.ooc.TeeOOCInstruction;
+import org.apache.sysds.runtime.instructions.ooc.OOCInstruction;
+import org.apache.sysds.runtime.instructions.ooc.ReblockOOCInstruction;
public class OOCInstructionParser extends InstructionParser {
protected static final Log LOG = LogFactory.getLog(OOCInstructionParser.class.getName());
@@ -78,7 +80,7 @@ public static OOCInstruction parseSingleInstruction(InstructionType ooctype, Str
case Tee:
return TeeOOCInstruction.parseInstruction(str);
case CentralMoment:
- return CentralMomentOOCInstruction.parseInstruction(str);
+ return CentralMomentOOCInstruction.parseInstruction(str);
case Ctable:
return CtableOOCInstruction.parseInstruction(str);
case ParameterizedBuiltin:
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/ooc/AggregateUnaryOOCInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/ooc/AggregateUnaryOOCInstruction.java
index 2a53c5400ae..34df0f4d249 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/ooc/AggregateUnaryOOCInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/ooc/AggregateUnaryOOCInstruction.java
@@ -30,37 +30,27 @@
import org.apache.sysds.runtime.instructions.cp.DoubleObject;
import org.apache.sysds.runtime.instructions.spark.data.IndexedMatrixValue;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
-import org.apache.sysds.runtime.matrix.data.MatrixIndexes;
import org.apache.sysds.runtime.matrix.data.OperationsOnMatrixValues;
import org.apache.sysds.runtime.matrix.operators.AggregateOperator;
import org.apache.sysds.runtime.matrix.operators.AggregateUnaryOperator;
-import org.apache.sysds.runtime.matrix.operators.Operator;
-import org.apache.sysds.runtime.meta.DataCharacteristics;
-import java.util.HashMap;
public class AggregateUnaryOOCInstruction extends ComputationOOCInstruction {
private AggregateOperator _aop = null;
- protected AggregateUnaryOOCInstruction(OOCType type, AggregateUnaryOperator auop, AggregateOperator aop,
+ protected AggregateUnaryOOCInstruction(OOCType type, AggregateUnaryOperator auop, AggregateOperator aop,
CPOperand in, CPOperand out, String opcode, String istr) {
super(type, auop, in, out, opcode, istr);
_aop = aop;
}
- protected AggregateUnaryOOCInstruction(OOCType type, Operator op, CPOperand in1, CPOperand in2, CPOperand in3,
- CPOperand out, String opcode, String istr) {
- super(type, op, in1, in2, in3, out, opcode, istr);
- _aop = null;
- }
-
public static AggregateUnaryOOCInstruction parseInstruction(String str) {
String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
InstructionUtils.checkNumFields(parts, 2);
String opcode = parts[0];
CPOperand in1 = new CPOperand(parts[1]);
CPOperand out = new CPOperand(parts[2]);
-
+
String aopcode = InstructionUtils.deriveAggregateOperatorOpcode(opcode);
CorrectionLocationType corrLoc = InstructionUtils.deriveAggregateOperatorCorrectionLocation(opcode);
AggregateUnaryOperator aggun = InstructionUtils.parseBasicAggregateUnaryOperator(opcode);
@@ -68,112 +58,37 @@ public static AggregateUnaryOOCInstruction parseInstruction(String str) {
return new AggregateUnaryOOCInstruction(
OOCType.AggregateUnary, aggun, aop, in1, out, opcode, str);
}
-
+
@Override
public void processInstruction( ExecutionContext ec ) {
- //TODO support all types of aggregations, currently only full aggregation, row aggregation and column aggregation
-
+ //TODO support all types of aggregations, currently only full aggregation
+
//setup operators and input queue
- AggregateUnaryOperator aggun = (AggregateUnaryOperator) getOperator();
+ AggregateUnaryOperator aggun = (AggregateUnaryOperator) getOperator();
MatrixObject min = ec.getMatrixObject(input1);
- OOCStream q = min.getStreamHandle();
+ LocalTaskQueue q = min.getStreamHandle();
+ IndexedMatrixValue tmp = null;
int blen = ConfigurationManager.getBlocksize();
- if (aggun.isRowAggregate() || aggun.isColAggregate()) {
- DataCharacteristics chars = ec.getDataCharacteristics(input1.getName());
- // number of blocks to process per aggregation idx (row or column dim)
- long emitThreshold = aggun.isRowAggregate()? chars.getNumColBlocks() : chars.getNumRowBlocks();
- OOCMatrixBlockTracker aggTracker = new OOCMatrixBlockTracker(emitThreshold);
- HashMap corrs = new HashMap<>(); // correction blocks
-
- OOCStream qOut = createWritableStream();
- ec.getMatrixObject(output).setStreamHandle(qOut);
-
- submitOOCTask(() -> {
- IndexedMatrixValue tmp = null;
- try {
- while((tmp = q.dequeue()) != LocalTaskQueue.NO_MORE_TASKS) {
- long idx = aggun.isRowAggregate() ?
- tmp.getIndexes().getRowIndex() : tmp.getIndexes().getColumnIndex();
- MatrixBlock ret = aggTracker.get(idx);
- if(ret != null) {
- MatrixBlock corr = corrs.get(idx);
-
- // aggregation
- MatrixBlock ltmp = (MatrixBlock) ((MatrixBlock) tmp.getValue())
- .aggregateUnaryOperations(aggun, new MatrixBlock(), blen, tmp.getIndexes());
- OperationsOnMatrixValues.incrementalAggregation(ret,
- _aop.existsCorrection() ? corr : null, ltmp, _aop, true);
-
- if (!aggTracker.putAndIncrementCount(idx, ret)){
- corrs.replace(idx, corr);
- continue;
- }
- }
- else {
- // first block for this idx - init aggregate and correction
- // TODO avoid corr block for inplace incremental aggregation
- int rows = tmp.getValue().getNumRows();
- int cols = tmp.getValue().getNumColumns();
- int extra = _aop.correction.getNumRemovedRowsColumns();
- ret = aggun.isRowAggregate()? new MatrixBlock(rows, 1 + extra, false) : new MatrixBlock(1 + extra, cols, false);
- MatrixBlock corr = aggun.isRowAggregate()? new MatrixBlock(rows, 1 + extra, false) : new MatrixBlock(1 + extra, cols, false);
-
- // aggregation
- MatrixBlock ltmp = (MatrixBlock) ((MatrixBlock) tmp.getValue()).aggregateUnaryOperations(
- aggun, new MatrixBlock(), blen, tmp.getIndexes());
- OperationsOnMatrixValues.incrementalAggregation(ret,
- _aop.existsCorrection() ? corr : null, ltmp, _aop, true);
-
- if(emitThreshold > 1){
- aggTracker.putAndIncrementCount(idx, ret);
- corrs.put(idx, corr);
- continue;
- }
- }
-
- // all input blocks for this idx processed - emit aggregated block
- ret.dropLastRowsOrColumns(_aop.correction);
- MatrixIndexes midx = aggun.isRowAggregate() ?
- new MatrixIndexes(tmp.getIndexes().getRowIndex(), 1) :
- new MatrixIndexes(1, tmp.getIndexes().getColumnIndex());
- IndexedMatrixValue tmpOut = new IndexedMatrixValue(midx, ret);
-
- qOut.enqueue(tmpOut);
- // drop intermediate states
- aggTracker.remove(idx);
- corrs.remove(idx);
- }
- qOut.closeInput();
- }
- catch(Exception ex) {
- throw new DMLRuntimeException(ex);
- }
- }, q, qOut);
- }
- // full aggregation
- else {
- IndexedMatrixValue tmp = null;
- //read blocks and aggregate immediately into result
- int extra = _aop.correction.getNumRemovedRowsColumns();
- MatrixBlock ret = new MatrixBlock(1,1+extra,false);
- MatrixBlock corr = new MatrixBlock(1,1+extra,false);
- try {
- while((tmp = q.dequeue()) != LocalTaskQueue.NO_MORE_TASKS) {
- //block aggregation
- MatrixBlock ltmp = (MatrixBlock) ((MatrixBlock) tmp.getValue())
- .aggregateUnaryOperations(aggun, new MatrixBlock(), blen, tmp.getIndexes());
- //accumulation into final result
- OperationsOnMatrixValues.incrementalAggregation(
- ret, _aop.existsCorrection() ? corr : null, ltmp, _aop, true);
- }
+ //read blocks and aggregate immediately into result
+ int extra = _aop.correction.getNumRemovedRowsColumns();
+ MatrixBlock ret = new MatrixBlock(1,1+extra,false);
+ MatrixBlock corr = new MatrixBlock(1,1+extra,false);
+ try {
+ while((tmp = q.dequeueTask()) != LocalTaskQueue.NO_MORE_TASKS) {
+ //block aggregation
+ MatrixBlock ltmp = (MatrixBlock) ((MatrixBlock) tmp.getValue())
+ .aggregateUnaryOperations(aggun, new MatrixBlock(), blen, tmp.getIndexes());
+ //accumulation into final result
+ OperationsOnMatrixValues.incrementalAggregation(
+ ret, _aop.existsCorrection() ? corr : null, ltmp, _aop, true);
}
- catch(Exception ex) {
- throw new DMLRuntimeException(ex);
- }
-
- //create scalar output
- ec.setScalarOutput(output.getName(), new DoubleObject(ret.get(0, 0)));
}
+ catch(Exception ex) {
+ throw new DMLRuntimeException(ex);
+ }
+
+ //create scalar output
+ ec.setScalarOutput(output.getName(), new DoubleObject(ret.get(0, 0)));
}
}
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/ooc/ComputationOOCInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/ooc/ComputationOOCInstruction.java
index 4dcdffcb0dc..bc5a4d841b4 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/ooc/ComputationOOCInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/ooc/ComputationOOCInstruction.java
@@ -33,7 +33,7 @@ protected ComputationOOCInstruction(OOCType type, Operator op, CPOperand in1, CP
input3 = null;
output = out;
}
-
+
protected ComputationOOCInstruction(OOCType type, Operator op, CPOperand in1, CPOperand in2, CPOperand out, String opcode, String istr) {
super(type, op, opcode, istr);
input1 = in1;
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/ooc/ReblockOOCInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/ooc/ReblockOOCInstruction.java
index 74b15c9fb0e..1f7fce3b146 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/ooc/ReblockOOCInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/ooc/ReblockOOCInstruction.java
@@ -41,7 +41,7 @@
public class ReblockOOCInstruction extends ComputationOOCInstruction {
private int blen;
- private ReblockOOCInstruction(Operator op, CPOperand in, CPOperand out,
+ private ReblockOOCInstruction(Operator op, CPOperand in, CPOperand out,
int br, int bc, String opcode, String instr)
{
super(OOCType.Reblock, op, in, out, opcode, instr);
@@ -71,29 +71,29 @@ public void processInstruction(ExecutionContext ec) {
//get the source format from the meta data
//MetaDataFormat iimd = (MetaDataFormat) min.getMetaData();
- //TODO support other formats than binary
-
+ //TODO support other formats than binary
+
//create queue, spawn thread for asynchronous reading, and return
OOCStream q = createWritableStream();
submitOOCTask(() -> readBinaryBlock(q, min.getFileName()), q);
-
+
MatrixObject mout = ec.getMatrixObject(output);
mout.setStreamHandle(q);
}
-
+
@SuppressWarnings("resource")
private void readBinaryBlock(OOCStream q, String fname) {
try {
//prepare file access
- JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
- Path path = new Path( fname );
+ JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
+ Path path = new Path( fname );
FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
-
+
//check existence and non-empty file
- MatrixReader.checkValidInputFile(fs, path);
-
+ MatrixReader.checkValidInputFile(fs, path);
+
//core reading
- for( Path lpath : IOUtilFunctions.getSequenceFilePaths(fs, path) ) { //1..N files
+ for( Path lpath : IOUtilFunctions.getSequenceFilePaths(fs, path) ) { //1..N files
//directly read from sequence files (individual partfiles)
try( SequenceFile.Reader reader = new SequenceFile
.Reader(job, SequenceFile.Reader.file(lpath)) )
diff --git a/src/test/java/org/apache/sysds/test/functions/ooc/SumScalarMultiplicationTest.java b/src/test/java/org/apache/sysds/test/functions/ooc/SumScalarMultiplicationTest.java
index f0d9228a533..2272588bab4 100644
--- a/src/test/java/org/apache/sysds/test/functions/ooc/SumScalarMultiplicationTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/ooc/SumScalarMultiplicationTest.java
@@ -23,7 +23,6 @@
import org.apache.sysds.common.Types;
import org.apache.sysds.common.Types.FileFormat;
import org.apache.sysds.common.Types.ValueType;
-import org.apache.sysds.hops.OptimizerUtils;
import org.apache.sysds.runtime.instructions.Instruction;
import org.apache.sysds.runtime.io.MatrixWriter;
import org.apache.sysds.runtime.io.MatrixWriterFactory;
@@ -58,26 +57,11 @@ public void setUp() {
* Test the sum of scalar multiplication, "sum(X*7)", with OOC backend.
*/
@Test
- public void testSumScalarMultNoRewrite() {
- testSumScalarMult(false);
- }
-
- /**
- * Test the sum of scalar multiplication, "sum(X)*7", with OOC backend.
- */
- @Test
- public void testSumScalarMultRewrite() {
- testSumScalarMult(true);
- }
-
-
- public void testSumScalarMult(boolean rewrite)
- {
+ public void testSumScalarMult() {
+
Types.ExecMode platformOld = rtplatform;
rtplatform = Types.ExecMode.SINGLE_NODE;
- boolean oldRewrite = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
- OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = rewrite;
-
+
try {
getAndLoadTestConfiguration(TEST_NAME);
String HOME = SCRIPT_DIR + TEST_DIR;
@@ -108,17 +92,16 @@ public void testSumScalarMult(boolean rewrite)
String prefix = Instruction.OOC_INST_PREFIX;
Assert.assertTrue("OOC wasn't used for RBLK",
heavyHittersContainsString(prefix + Opcodes.RBLK));
- if(!rewrite)
- Assert.assertTrue("OOC wasn't used for SUM",
- heavyHittersContainsString(prefix + Opcodes.MULT));
Assert.assertTrue("OOC wasn't used for SUM",
heavyHittersContainsString(prefix + Opcodes.UAKP));
+
+// boolean usedOOCMult = Statistics.getCPHeavyHitterOpCodes().contains(prefix + Opcodes.MULT);
+// Assert.assertTrue("OOC wasn't used for MULT", usedOOCMult);
}
catch(Exception ex) {
Assert.fail(ex.getMessage());
}
finally {
- OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = oldRewrite;
resetExecMode(platformOld);
}
}