diff --git a/.gitignore b/.gitignore index d2fcdb9a4de..4cc5f69557c 100644 --- a/.gitignore +++ b/.gitignore @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -127,7 +127,7 @@ derby.log src/main/cpp/build src/main/cpp/bin -# legacy dml +# legacy dml *.dmlt # Performance Test artifacts @@ -157,3 +157,16 @@ docker/mountFolder/*.bin.mtd SEAL-*/ +data/lineorder.tbl +data/test/lineorder.tbl +# SSB Data and subfolders +/data/ +shell/ssbOutputData/ +/sql/ssb.duckdb +# SSB data, outputs, and local DB file +/data/ +/scripts/ssb/shell/ssbOutputData/ +/scripts/ssb/sql/ssb.duckdb + +# Auto-generated single-thread config for SSB runs +/conf/single_thread.xml diff --git a/conf/single_thread.xml b/conf/single_thread.xml new file mode 100644 index 00000000000..dba1ac6b805 --- /dev/null +++ b/conf/single_thread.xml @@ -0,0 +1,8 @@ + + + sysds.cp.parallel.opsfalse + + + sysds.num.threads1 + + diff --git a/scripts/ssb/README.md b/scripts/ssb/README.md new file mode 100644 index 00000000000..397350807e5 --- /dev/null +++ b/scripts/ssb/README.md @@ -0,0 +1,505 @@ +# Star Schema Benchmark (SSB) for SystemDS + +This README documents the SSB DML queries under `scripts/ssb/queries/` and the runner scripts under `scripts/ssb/shell/` that execute and benchmark them. It is focused on what is implemented today, how to run it, and how to interpret the outputs for performance analysis. + +--- + +## Table of Contents + +1. Project Layout +2. Quick Start +3. Data Location (`--input-dir` and DML `input_dir`) +4. Single-Engine Runner (`scripts/ssb/shell/run_ssb.sh`) +5. Multi-Engine Performance Runner (`scripts/ssb/shell/run_all_perf.sh`) +6. Outputs and Examples +7. Adding/Editing Queries +8. Troubleshooting + +--- + +## 1) Project Layout + +Paths are relative to the repo root: + +``` +systemds/ +├── scripts/ssb/ +│ ├── README.md # This guide +│ ├── queries/ # DML queries (q1_1.dml ... q4_3.dml) +│ │ ├── q1_1.dml - q1_3.dml # Flight 1 +│ │ ├── q2_1.dml - q2_3.dml # Flight 2 +│ │ ├── q3_1.dml - q3_4.dml # Flight 3 +│ │ └── q4_1.dml - q4_3.dml # Flight 4 +│ ├── shell/ +│ │ ├── run_ssb.sh # Single-engine (SystemDS) runner +│ │ ├── run_all_perf.sh # Multi-engine performance benchmark +│ │ └── ssbOutputData/ # Results (created on first run) +│ │ ├── QueryData/ # Per-query outputs from run_ssb.sh +│ │ └── PerformanceData/ # Multi-engine outputs from run_all_perf.sh +│ └── sql/ # SQL versions + `ssb.duckdb` for DuckDB +``` + +Note: The SSB raw data directory is not committed. You must point the runners to your generated data with `--input-dir`. + +--- + +## 2) Quick Start + +Set up SystemDS and run the SSB queries. + +1) Build SystemDS (from repo root): + +```bash +mvn -DskipTests package +``` + +2) Make sure the SystemDS binary exists (repo-local `bin/systemds` or on `PATH`). + +3) Make runner scripts executable: + +```bash +chmod +x scripts/ssb/shell/run_ssb.sh scripts/ssb/shell/run_all_perf.sh +``` + +4) Provide SSB data (from dbgen) in a directory, e.g. `/path/to/ssb-data`. + +5) Run a single SSB query on SystemDS (from repo root): + +```bash +scripts/ssb/shell/run_ssb.sh q1.1 --input-dir=/path/to/ssb-data --stats +``` + +6) Run the multi-engine performance benchmark across all queries (from repo root): + +```bash +scripts/ssb/shell/run_all_perf.sh --input-dir=/path/to/ssb-data --stats --repeats=5 +``` + +If `--input-dir` is omitted, the scripts default to `./data/` under the repo root. + +--- + +## 3) Data Location (`--input-dir` and DML `input_dir`) + +Both runners pass a named argument `input_dir` into DML as: + +``` +-nvargs input_dir=/absolute/path/to/ssb-data +``` + +Your DML scripts should construct paths from `input_dir`. Example: + +```dml +dates = read(paste(input_dir, "/date.tbl", sep=""), data_type="frame", format="csv", sep="|", header=FALSE) +lineorder = read(paste(input_dir, "/lineorder.tbl", sep=""), data_type="frame", format="csv", sep="|", header=FALSE) +``` + +Expected base files in `input_dir`: `customer.tbl`, `supplier.tbl`, `part.tbl`, `date.tbl` and `lineorder*.tbl` (fact table name can vary by scale). The runners validate that `--input-dir` exists before executing. + +--- + +## 4) Single-Engine Runner (`scripts/ssb/shell/run_ssb.sh`) + +Runs SSB DML queries with SystemDS and saves results per query. + +- Usage: + - `scripts/ssb/shell/run_ssb.sh` — run all SSB queries + - `scripts/ssb/shell/run_ssb.sh q1.1 q2.3` — run specific queries + - `scripts/ssb/shell/run_ssb.sh --stats` — include SystemDS internal statistics + - `scripts/ssb/shell/run_ssb.sh --input-dir=/path/to/data` — set data dir + - `scripts/ssb/shell/run_ssb.sh --output-dir=/tmp/out` — set output dir + +- Query names: You can use dotted form (`q1.1`); the runner maps to `q1_1.dml` internally. + +- Functionality: + - Single-threaded execution via auto-generated `conf/single_thread.xml`. + - DML `input_dir` forwarding with `-nvargs`. + - Pre-check for data directory; clear errors if missing. + - Runtime error detection by scanning for “An Error Occurred : …”. + - Optional `--stats` to capture SystemDS internal statistics in JSON. + - Per-query outputs in TXT, CSV, and JSON. + - `run.json` with run-level metadata and per-query status/results. + - Clear end-of-run summary and, for table results, a “DETAILED QUERY RESULTS” section. + - Exit code is non-zero if any query failed (handy for CI). + +- Output layout: + - Base directory: `--output-dir` (default: `scripts/ssb/shell/ssbOutputData/QueryData`) + - Each run: `ssb_run_/` + - `txt/.txt` — human-readable result + - `csv/.csv` — scalar or table as CSV + - `json/.json` — per-query JSON + - `run.json` — full metadata and results for the run + +- Example console output (abridged): + +``` +[1/13] Running: q1_1.dml +... +========================================= +SSB benchmark completed! +Total queries executed: 13 +Failed queries: 0 +Statistics: enabled + +========================================= +RUN METADATA SUMMARY +========================================= +Timestamp: 2025-09-05 12:34:56 UTC +Hostname: myhost +Seed: 123456 +Software Versions: + SystemDS: 3.4.0-SNAPSHOT + JDK: 21.0.2 +System Resources: + CPU: Apple M2 + RAM: 16GB +Data Build Info: + SSB Data: customer:300000 part:200000 supplier:2000 lineorder:6001215 +========================================= + +=================================================== +QUERIES SUMMARY +=================================================== +No. Query Result Status +--------------------------------------------------- +1 q1.1 12 rows (see below) ✓ Success +2 q1.2 1 ✓ Success +... +=================================================== + +========================================= +DETAILED QUERY RESULTS +========================================= +[1] Results for q1.1: +---------------------------------------- +1992|ASIA|12345.67 +1993|ASIA|23456.78 +... +---------------------------------------- +``` + +--- + +## 5) Multi-Engine Performance Runner (`scripts/ssb/shell/run_all_perf.sh`) + +Runs SSB queries across SystemDS, PostgreSQL, and DuckDB with repeated timings and statistical analysis. + +- Usage: + - `scripts/ssb/shell/run_all_perf.sh` — run all queries on available engines + - `scripts/ssb/shell/run_all_perf.sh q1.1 q2.3` — run specific queries + - `scripts/ssb/shell/run_all_perf.sh --warmup=2 --repeats=10` — control sampling + - `scripts/ssb/shell/run_all_perf.sh --stats` — include core/internal engine timings + - `scripts/ssb/shell/run_all_perf.sh --layout=wide|stacked` — control terminal layout + - `scripts/ssb/shell/run_all_perf.sh --input-dir=... --output-dir=...` — set paths + +- Query names: dotted form (`q1.1`) is accepted; mapped internally to `q1_1.dml`. + +- Engine prerequisites: + - PostgreSQL: + - Install `psql` CLI and ensure a PostgreSQL server is running. + - Default connection in the script: `POSTGRES_DB=ssb`, `POSTGRES_USER=$(whoami)`, `POSTGRES_HOST=localhost`. + - Create the `ssb` database and load the standard SSB tables and data (schema not included in this repo). The SQL queries under `scripts/ssb/sql/` expect the canonical SSB schema and data. + - The runner verifies connectivity; if it cannot connect or tables are missing, PostgreSQL results are skipped. + - DuckDB: + - Install the DuckDB CLI (`duckdb`). + - The runner looks for the database at `scripts/ssb/sql/ssb.duckdb`. Ensure it contains SSB tables and data. + - If the CLI is missing or the DB file cannot be opened, DuckDB results are skipped. + - SystemDS is required; the other engines are optional. Missing engines are reported and skipped gracefully. + +- Functionality: + - Single-threaded execution for fairness (SystemDS config; SQL engines via settings). + - Pre-flight data-dir check and SystemDS test-run with runtime-error detection. + - Warmups and repeated measurements using `/usr/bin/time -p` (ms resolution). + - Statistics per engine: mean, population stdev, p95, and CV%. + - “Shell” vs “Core” time: SystemDS core from `-stats`, PostgreSQL core via EXPLAIN ANALYZE, DuckDB core via JSON profiling. + - Environment verification: gracefully skips PostgreSQL or DuckDB if not available. + - Terminal-aware output: wide table with grid or stacked multi-line layout. + - Results to CSV and JSON with rich metadata (system info, versions, run config). + +- Layouts (display formats): + - Auto selection: `--layout=auto` (default). Chooses `wide` if terminal is wide enough, else `stacked`. + - Wide layout: `--layout=wide`. Prints a grid with columns for each engine and a `Fastest` column. Three header rows show labels for `mean`, `±/CV`, and `p95`. + - Stacked layout: `--layout=stacked` or `--stacked`. Prints a compact, multi-line block per query (best for narrow terminals). + - Dynamic scaling: The wide layout scales column widths to fit the terminal; if still too narrow, it falls back to stacked. + - Row semantics: Row 1 = mean (ms); Row 2 = `±stdev/CV%`; Row 3 = `p95 (ms)`. + - Fastest: The runner highlights the engine with the lowest mean per query. + +- Output layout: + - Base directory: `--output-dir` (default: `scripts/ssb/shell/ssbOutputData/PerformanceData`) + - Files per run (timestamped basename): + - `ssb_results_.csv` + - `ssb_results_.json` + +- Example console output (abridged, wide layout): + +``` +================================================================================== + MULTI-ENGINE PERFORMANCE BENCHMARK METADATA +================================================================================== +Timestamp: 2025-09-05 12:34:56 UTC +Hostname: myhost +Seed: 123456 +Software Versions: + SystemDS: 3.4.0-SNAPSHOT + JDK: 21.0.2 + PostgreSQL: psql (PostgreSQL) 14.11 + DuckDB: v0.10.3 +System Resources: + CPU: Apple M2 + RAM: 16GB +Data Build Info: + SSB Data: customer:300000 part:200000 supplier:2000 lineorder:6001215 +Run Configuration: + Statistics: enabled + Queries: 13 selected + Warmup Runs: 1 + Repeat Runs: 5 + ++--------+--------------+--------------+--------------+----------------+--------------+----------------+----------+ +| Query | SysDS Shell | SysDS Core | PostgreSQL | PostgreSQL Core| DuckDB | DuckDB Core | Fastest | +| | mean | mean | mean | mean | mean | mean | | +| | ±/CV | ±/CV | ±/CV | ±/CV | ±/CV | ±/CV | | +| | p95 | p95 | p95 | p95 | p95 | p95 | | ++--------+--------------+--------------+--------------+----------------+--------------+----------------+----------+ +| q1_1 | 1824.0 | 1210.0 | 2410.0 | 2250.0 | 980.0 | 910.0 | DuckDB | +| | ±10.2/0.6% | ±8.6/0.7% | ±15.1/0.6% | ±14.0/0.6% | ±5.4/0.6% | ±5.0/0.5% | | +| | p95:1840.0 | p95:1225.0 | p95:2435.0 | p95:2274.0 | p95:989.0 | p95:919.0 | | ++--------+--------------+--------------+--------------+----------------+--------------+----------------+----------+ +``` + +- Example console output (abridged, stacked layout): + +``` +Query : q1_1 Fastest: DuckDB + SystemDS Shell: 1824.0 + ±10.2ms/0.6% + p95:1840.0ms + SystemDS Core: 1210.0 + ±8.6ms/0.7% + p95:1225.0ms + PostgreSQL: 2410.0 + ±15.1ms/0.6% + p95:2435.0ms + PostgreSQL Core:2250.0 + ±14.0ms/0.6% + p95:2274.0ms + DuckDB: 980.0 + ±5.4ms/0.6% + p95:989.0ms + DuckDB Core: 910.0 + ±5.0ms/0.5% + p95:919.0ms +-------------------------------------------------------------------------------- +``` + +--- + +## 6) Outputs and Examples + +Where to find results and how to read them. + +- SystemDS-only runner (`scripts/ssb/shell/run_ssb.sh`): + - Directory: `scripts/ssb/shell/ssbOutputData/QueryData/ssb_run_/` + - Files: `txt/.txt`, `csv/.csv`, `json/.json`, and `run.json` + - `run.json` example (stats enabled, single query): + +```json +{ + "benchmark_type": "ssb_systemds", + "timestamp": "2025-09-07 19:45:11 UTC", + "hostname": "eduroam-141-23-175-117.wlan.tu-berlin.de", + "seed": 849958376, + "software_versions": { + "systemds": "3.4.0-SNAPSHOT", + "jdk": "17.0.15" + }, + "system_resources": { + "cpu": "Apple M1 Pro", + "ram": "16GB" + }, + "data_build_info": { + "customer": "30000", + "part": "200000", + "supplier": "2000", + "date": "2557", + "lineorder": "8217" + }, + "run_configuration": { + "statistics_enabled": true, + "queries_selected": 1, + "queries_executed": 1, + "queries_failed": 0 + }, + "results": [ + { + "query": "q1_1", + "result": "687752409 ", + "stats": [ + "SystemDS Statistics:", + "Total elapsed time: 1.557 sec.", + "Total compilation time: 0.410 sec.", + "Total execution time: 1.147 sec.", + "Cache hits (Mem/Li/WB/FS/HDFS): 11054/0/0/0/2.", + "Cache writes (Li/WB/FS/HDFS): 0/26/3/0.", + "Cache times (ACQr/m, RLS, EXP): 0.166/0.001/0.060/0.000 sec.", + "HOP DAGs recompiled (PRED, SB): 0/175.", + "HOP DAGs recompile time: 0.063 sec.", + "Functions recompiled: 2.", + "Functions recompile time: 0.016 sec.", + "Total JIT compile time: 1.385 sec.", + "Total JVM GC count: 1.", + "Total JVM GC time: 0.026 sec.", + "Heavy hitter instructions:", + " # Instruction Time(s) Count", + " 1 m_raJoin 0.940 1", + " 2 ucumk+ 0.363 3", + " 3 - 0.219 1345", + " 4 nrow 0.166 7", + " 5 ctable 0.086 2", + " 6 * 0.078 1", + " 7 parallelBinarySearch 0.069 1", + " 8 ba+* 0.049 5", + " 9 rightIndex 0.016 8611", + " 10 leftIndex 0.015 1680" + ], + "status": "success" + } + ] +} +``` + + Notes: + - The `result` field contains the query’s output (scalar or tabular content collapsed). When `--stats` is used, `stats` contains the full SystemDS statistics block line-by-line. + - For failed queries, an `error_message` string is included and `status` is set to `"error"`. + +- Multi-engine runner (`scripts/ssb/shell/run_all_perf.sh`): + - Directory: `scripts/ssb/shell/ssbOutputData/PerformanceData/` + - Files per run: `ssb_results_.csv` and `.json` + - CSV contains display strings and raw numeric stats (mean/stdev/p95) for each engine; JSON contains the same plus metadata and fastest-engine per query. + - `ssb_results_*.json` example (stats enabled, single query): + +```json +{ + "benchmark_metadata": { + "benchmark_type": "multi_engine_performance", + "timestamp": "2025-09-07 20:11:16 UTC", + "hostname": "eduroam-141-23-175-117.wlan.tu-berlin.de", + "seed": 578860764, + "software_versions": { + "systemds": "3.4.0-SNAPSHOT", + "jdk": "17.0.15", + "postgresql": "psql (PostgreSQL) 17.5", + "duckdb": "v1.3.2 (Ossivalis) 0b83e5d2f6" + }, + "system_resources": { + "cpu": "Apple M1 Pro", + "ram": "16GB" + }, + "data_build_info": { + "customer": "30000", + "part": "200000", + "supplier": "2000", + "date": "2557", + "lineorder": "8217" + }, + "run_configuration": { + "statistics_enabled": true, + "queries_selected": 1, + "warmup_runs": 1, + "repeat_runs": 5 + } + }, + "results": [ + { + "query": "q1_1", + "systemds": { + "shell": { + "display": "2186.0 (±95.6ms/4.4%, p95:2250.0ms)", + "mean_ms": 2186.0, + "stdev_ms": 95.6, + "p95_ms": 2250.0 + }, + "core": { + "display": "1151.2 (±115.3ms/10.0%, p95:1334.0ms)", + "mean_ms": 1151.2, + "stdev_ms": 115.3, + "p95_ms": 1334.0 + }, + "status": "success", + "error_message": null + }, + "postgresql": { + "display": "26.0 (±4.9ms/18.8%, p95:30.0ms)", + "mean_ms": 26.0, + "stdev_ms": 4.9, + "p95_ms": 30.0 + }, + "postgresql_core": { + "display": "3.8 (±1.4ms/36.8%, p95:5.7ms)", + "mean_ms": 3.8, + "stdev_ms": 1.4, + "p95_ms": 5.7 + }, + "duckdb": { + "display": "30.0 (±0.0ms/0.0%, p95:30.0ms)", + "mean_ms": 30.0, + "stdev_ms": 0.0, + "p95_ms": 30.0 + }, + "duckdb_core": { + "display": "1.1 (±0.1ms/9.1%, p95:1.3ms)", + "mean_ms": 1.1, + "stdev_ms": 0.1, + "p95_ms": 1.3 + }, + "fastest_engine": "PostgreSQL" + } + ] +} +``` + + Differences at a glance: + - Single-engine `run.json` focuses on query output (`result`) and, when enabled, the SystemDS `stats` array. Status and error handling are per-query. + - Multi-engine results JSON focuses on timing statistics for each engine (`shell` vs `core` for SystemDS; `postgresql`/`postgresql_core`; `duckdb`/`duckdb_core`) along with a `fastest_engine` field. It does not include the query’s actual result values. + +--- + +## 7) Adding/Editing Queries + +Guidelines for DML in `scripts/ssb/queries/`: + +- Name files as `qX_Y.dml` (e.g., `q1_1.dml`). The runners accept `q1.1` on the CLI and map it for you. +- Always derive paths from `input_dir` named argument (see Section 3). +- Keep I/O separate from compute where possible (helps early error detection). +- Add a short header comment with original SQL and intent. + +Example header: + +```dml +/* + SQL: SELECT ... + Description: Revenue per month by supplier region +*/ +``` + +--- + +## 8) Troubleshooting + +- Missing data directory: pass `--input-dir=/path/to/ssb-data` and ensure `*.tbl` files exist. +- SystemDS not found: build (`mvn -DskipTests package`) and use `./bin/systemds` or ensure `systemds` is on PATH. +- Query fails with runtime error: the runners mark `status: "error"` and include a short `error_message` in JSON outputs. See console snippet for context. +- macOS cache dropping: OS caches cannot be dropped like Linux; the multi-engine runner mitigates with warmups + repeated averages and reports p95/CV. + +If something looks off, attach the relevant `run.json` or `ssb_results_*.json` when filing issues. + +- To debug DML runtime errors, run the DML directly: + +```bash +./bin/systemds -f scripts/ssb/queries/q1_1.dml -nvargs input_dir=/path/to/data +``` + +- When `--stats` is enabled, SystemDS internal "core" timing is extracted and reported separately (useful to separate JVM / startup overhead from core computation). + +All these metrics appear in the generated CSVs and JSON entries. +- Permission errors: `chmod +x scripts/ssb/shell/*.sh`. diff --git a/scripts/ssb/queries/q1_1.dml b/scripts/ssb/queries/q1_1.dml new file mode 100644 index 00000000000..295118ecd38 --- /dev/null +++ b/scripts/ssb/queries/q1_1.dml @@ -0,0 +1,70 @@ +/* DML-script implementing the ssb query Q1.1 in SystemDS. +SELECT SUM(lo_extendedprice * lo_discount) AS REVENUE +FROM lineorder, dates +WHERE + lo_orderdate = d_datekey + AND d_year = 1993 + AND lo_discount BETWEEN 1 AND 3 + AND lo_quantity < 25; + +Usage: +./bin/systemds scripts/ssb/queries/q1_1.dml -nvargs input_dir="/path/to/data" +./bin/systemds scripts/ssb/queries/q1_1.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data" +or with explicit -f flag: +./bin/systemds -f scripts/ssb/queries/q1_1.dml -nvargs input_dir="/path/to/data" + +Parameters: +input_dir - Path to input directory containing the table files (e.g., ./data) +*/ +# -- SOURCING THE RA-FUNCTIONS -- +source("./scripts/builtin/raSelection.dml") as raSel +source("./scripts/builtin/raJoin.dml") as raJoin + +# -- PARAMETER HANDLING -- +input_dir = ifdef($input_dir, "./data"); +print("Loading tables from directory: " + input_dir); + +# -- READING INPUT FILES -- +# CSV TABLES +date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); + + +# -- PREPARING -- +# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : YEAR +date_csv_min = cbind(date_csv[, 1], date_csv[, 5]); +date_matrix_min = as.matrix(date_csv_min); + +# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-6 : LO_ORDERDATE | +# COL-9 : LO_QUANTITY | COL-10 : LO_EXTPRICE | COL-12 : LO_DISCOUNT +lineorder_csv_min = cbind(lineorder_csv[, 6], lineorder_csv[, 9], lineorder_csv[, 10], lineorder_csv[, 12]); +lineorder_matrix_min = as.matrix(lineorder_csv_min); + + +# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION -- +d_year_filt = raSel::m_raSelection(date_matrix_min, col=2, op="==", val=1993); # D_YEAR = '1993' + +# LO_QUANTITY < 25 +lo_quan_filt = raSel::m_raSelection(lineorder_matrix_min, col=2, op="<", val=25); + +# LO_DISCOUNT BETWEEN 1 AND 3 +lo_quan_disc_filt = raSel::m_raSelection(lo_quan_filt, col=4, op=">=", val=1); +lo_quan_disc_filt = raSel::m_raSelection(lo_quan_disc_filt, col=4, op="<=", val=3); + + +# -- JOIN TABLES WITH RA-JOIN FUNCTION -- +# JOINING FILTERED LINEORDER TABLE WITH FILTERED DATE TABLE WHERE LO_ORDERDATE = D_DATEKEY +joined_matrix = raJoin::m_raJoin(A=lo_quan_disc_filt, colA=1, B=d_year_filt, colB=1, method="sort-merge"); +#print("LO-DATE JOINED."); + + +# -- AGGREGATION -- +lo_extprice = joined_matrix[, 3]; #LO_EXTPRICE : 3 COLUMN OF JOINED-MATRIX +lo_disc = joined_matrix[, 4]; #LO_DISCOUNT : 4 COLUMN OF JOINED-MATRIX +revenue = sum(lo_extprice * lo_disc); + +print("REVENUE: " + as.integer(revenue)); + +#print("Q1.1 finished.\n"); + + diff --git a/scripts/ssb/queries/q1_2.dml b/scripts/ssb/queries/q1_2.dml new file mode 100644 index 00000000000..6f37d451e3e --- /dev/null +++ b/scripts/ssb/queries/q1_2.dml @@ -0,0 +1,92 @@ +/*DML-script implementing the ssb query Q1.2 in SystemDS. +SELECT SUM(lo_extendedprice * lo_discount) AS REVENUE +FROM lineorder, dates +WHERE + lo_orderdate = d_datekey + AND d_yearmonth = 'Jan1994' + AND lo_discount BETWEEN 4 AND 6 + AND lo_quantity BETWEEN 26 AND 35; + +Usage: +./bin/systemds scripts/ssb/queries/q1_2.dml -nvargs input_dir="/path/to/data" +./bin/systemds scripts/ssb/queries/q1_2.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data" +or with explicit -f flag: +./bin/systemds -f scripts/ssb/queries/q1_2.dml -nvargs input_dir="/path/to/data" + +Parameters: +input_dir - Path to input directory containing the table files (e.g., ./data) +*/ + +# -- SOURCING THE RA-FUNCTIONS -- +source("./scripts/builtin/raSelection.dml") as raSel +source("./scripts/builtin/raJoin.dml") as raJoin + +# -- PARAMETER HANDLING -- +input_dir = ifdef($input_dir, "./data"); +print("Loading tables from directory: " + input_dir); + +# -- READING INPUT FILES -- +# CSV TABLES +date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); + +# -- PREPARING -- +# Optimized approach: Single-pass filtering with direct matrix construction +# Convert date key column to numeric matrix for proper handling +date_keys_matrix = as.matrix(date_csv[, 1]); + +# Count Jan1994 rows first to pre-allocate matrix efficiently +date_nrows = nrow(date_csv); +jan1994_count = 0; +for (i in 1:date_nrows) { + yearmonth_val = as.scalar(date_csv[i, 7]); + if (yearmonth_val == "Jan1994") { + jan1994_count = jan1994_count + 1; + } +} + +# Pre-allocate final matrix and fill in single pass +date_filtered = matrix(0, jan1994_count, 2); +filtered_idx = 0; +for (i in 1:date_nrows) { + yearmonth_val = as.scalar(date_csv[i, 7]); + if (yearmonth_val == "Jan1994") { + filtered_idx = filtered_idx + 1; + date_filtered[filtered_idx, 1] = as.scalar(date_keys_matrix[i, 1]); # date_key + date_filtered[filtered_idx, 2] = 1; # encoded value for Jan1994 + } +} + +# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-6 : LO_ORDERDATE | +# COL-9 : LO_QUANTITY | COL-10 : LO_EXTPRICE | COL-12 : LO_DISCOUNT +lineorder_csv_min = cbind(lineorder_csv[, 6], lineorder_csv[, 9], lineorder_csv[, 10], lineorder_csv[, 12]); +lineorder_min_matrix = as.matrix(lineorder_csv_min); + + +# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION -- +# We already filtered for D_YEARMONTH = 'Jan1994', so d_year_filt is our filtered date data +d_year_filt = date_filtered; + +# LO_QUANTITY BETWEEN 26 AND 35 +lo_quan_filt = raSel::m_raSelection(lineorder_min_matrix, col=2, op=">=", val=26); +lo_quan_filt = raSel::m_raSelection(lo_quan_filt, col=2, op="<=", val=35); + +# LO_DISCOUNT BETWEEN 4 AND 6 +lo_quan_disc_filt = raSel::m_raSelection(lo_quan_filt, col=4, op=">=", val=4); +lo_quan_disc_filt = raSel::m_raSelection(lo_quan_disc_filt, col=4, op="<=", val=6); + + +# -- JOIN TABLES WITH RA-JOIN FUNCTION -- +# JOINING FILTERED LINEORDER TABLE WITH FILTERED DATE TABLE WHERE LO_ORDERDATE = D_DATEKEY +joined_matrix = raJoin::m_raJoin(A=lo_quan_disc_filt, colA=1, B=d_year_filt, colB=1, method="sort-merge"); +#print("LO-DATE JOINED."); + + +# -- AGGREGATION -- +lo_extprice = joined_matrix[, 3]; #LO_EXTPRICE : 3 COLUMN OF JOINED-MATRIX +lo_disc = joined_matrix[, 4]; #LO_DISCOUNT : 4 COLUMN OF JOINED-MATRIX +revenue = sum(lo_extprice * lo_disc); + +print("REVENUE: " + as.integer(revenue)); + +#print("Q1.2 finished.\n"); \ No newline at end of file diff --git a/scripts/ssb/queries/q1_3.dml b/scripts/ssb/queries/q1_3.dml new file mode 100644 index 00000000000..454eeec02c0 --- /dev/null +++ b/scripts/ssb/queries/q1_3.dml @@ -0,0 +1,93 @@ +/*DML-script implementing the ssb query Q1.3 in SystemDS. +SELECT SUM(lo_extendedprice * lo_discount) AS REVENUE +FROM lineorder, dates +WHERE + lo_orderdate = d_datekey + AND d_weeknuminyear = 6 + AND d_year = 1994 + AND lo_discount BETWEEN 5 AND 7 + AND lo_quantity BETWEEN 26 AND 35; + +Usage: +./bin/systemds scripts/ssb/queries/q1_3.dml -nvargs input_dir="/path/to/data" +./bin/systemds scripts/ssb/queries/q1_3.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data" +or with explicit -f flag: +./bin/systemds -f scripts/ssb/queries/q1_3.dml -nvargs input_dir="/path/to/data" + +Parameters: +input_dir - Path to input directory containing the table files (e.g., ./data) +*/ + + +# -- SOURCING THE RA-FUNCTIONS -- +source("./scripts/builtin/raSelection.dml") as raSel +source("./scripts/builtin/raJoin.dml") as raJoin + +# -- PARAMETER HANDLING -- +input_dir = ifdef($input_dir, "./data"); + +# -- READING INPUT FILES -- +# CSV TABLES +date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); + +# -- PREPARING -- +# Optimized approach: Two-pass filtering with direct matrix construction +# Convert date columns to numeric matrices for proper handling +date_keys_matrix = as.matrix(date_csv[, 1]); # date_key +date_year_matrix = as.matrix(date_csv[, 5]); # d_year +date_weeknum_matrix = as.matrix(date_csv[, 12]); # d_weeknuminyear + +# Count matching rows first to pre-allocate matrix efficiently +date_nrows = nrow(date_csv); +matching_count = 0; +for (i in 1:date_nrows) { + year_val = as.scalar(date_year_matrix[i, 1]); + weeknum_val = as.scalar(date_weeknum_matrix[i, 1]); + if (year_val == 1994 && weeknum_val == 6) { + matching_count = matching_count + 1; + } +} + +# Pre-allocate final matrix and fill in single pass +date_filtered = matrix(0, matching_count, 2); +filtered_idx = 0; +for (i in 1:date_nrows) { + year_val = as.scalar(date_year_matrix[i, 1]); + weeknum_val = as.scalar(date_weeknum_matrix[i, 1]); + if (year_val == 1994 && weeknum_val == 6) { + filtered_idx = filtered_idx + 1; + date_filtered[filtered_idx, 1] = as.scalar(date_keys_matrix[i, 1]); # date_key + date_filtered[filtered_idx, 2] = 1; # encoded value for matching criteria + } +} + +# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-6 : LO_ORDERDATE | +# COL-9 : LO_QUANTITY | COL-10 : LO_EXTPRICE | COL-12 : LO_DISCOUNT +lineorder_csv_min = cbind(lineorder_csv[, 6], lineorder_csv[, 9], lineorder_csv[, 10], lineorder_csv[, 12]); +lineorder_min_matrix = as.matrix(lineorder_csv_min); + +# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION -- +# We already filtered for D_YEAR = 1994 AND D_WEEKNUMINYEAR = 6, so date_filtered is our filtered date data +d_year_filt = date_filtered; + +# LO_QUANTITY BETWEEN 26 AND 35 +lo_quan_filt = raSel::m_raSelection(lineorder_min_matrix, col=2, op=">=", val=26); +lo_quan_filt = raSel::m_raSelection(lo_quan_filt, col=2, op="<=", val=35); + +# LO_DISCOUNT BETWEEN 5 AND 7 (FIXED: was incorrectly >=6) +lo_quan_disc_filt = raSel::m_raSelection(lo_quan_filt, col=4, op=">=", val=5); +lo_quan_disc_filt = raSel::m_raSelection(lo_quan_disc_filt, col=4, op="<=", val=7); + + +# -- JOIN TABLES WITH RA-JOIN FUNCTION -- +# JOINING FILTERED LINEORDER TABLE WITH FILTERED DATE TABLE WHERE LO_ORDERDATE = D_DATEKEY +joined_matrix = raJoin::m_raJoin(A=lo_quan_disc_filt, colA=1, B=d_year_filt, colB=1, method="sort-merge"); + + +# -- AGGREGATION -- +lo_extprice = joined_matrix[, 3]; #LO_EXTPRICE : 3 COLUMN OF JOINED-MATRIX +lo_disc = joined_matrix[, 4]; #LO_DISCOUNT : 4 COLUMN OF JOINED-MATRIX +revenue = sum(lo_extprice * lo_disc); + +print("REVENUE: " + as.integer(revenue)); \ No newline at end of file diff --git a/scripts/ssb/queries/q2_1.dml b/scripts/ssb/queries/q2_1.dml new file mode 100644 index 00000000000..06d675161f7 --- /dev/null +++ b/scripts/ssb/queries/q2_1.dml @@ -0,0 +1,303 @@ +/*DML-script implementing the ssb query Q2.1 in SystemDS. +SELECT SUM(lo_revenue), d_year, p_brand +FROM lineorder, dates, part, supplier +WHERE + lo_orderdate = d_datekey + AND lo_partkey = p_partkey + AND lo_suppkey = s_suppkey + AND p_category = 'MFGR#12' + AND s_region = 'AMERICA' +GROUP BY d_year, p_brand +ORDER BY p_brand; + +Usage: +./bin/systemds scripts/ssb/queries/q2_1.dml -nvargs input_dir="/path/to/data" +./bin/systemds scripts/ssb/queries/q2_1.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data" +or with explicit -f flag: +./bin/systemds -f scripts/ssb/queries/q2_1.dml -nvargs input_dir="/path/to/data" + +Parameters: +input_dir - Path to input directory containing the table files (e.g., ./data) +*/ + +# -- SOURCING THE RA-FUNCTIONS -- +source("./scripts/builtin/raSelection.dml") as raSel +source("./scripts/builtin/raJoin.dml") as raJoin +source("./scripts/builtin/raGroupby.dml") as raGrp + +# -- PARAMETER HANDLING -- +input_dir = ifdef($input_dir, "./data"); + +# -- READING INPUT FILES -- +# CSV TABLES +date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); + +# -- PREPARING -- +# Optimized approach: On-the-fly filtering with direct matrix construction for string fields + +# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR +date_csv_min = cbind(date_csv[, 1], date_csv[, 5]); +date_matrix_min = as.matrix(date_csv_min); + +# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-4 : LO_PARTKEY | COL-5 : LO_SUPPKEY | +# COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE +lineorder_csv_min = cbind(lineorder_csv[, 4], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13]); +lineorder_matrix_min = as.matrix(lineorder_csv_min); + +# ON-THE-FLY PART TABLE FILTERING AND ENCODING (P_CATEGORY = 'MFGR#12') +# Two-pass approach: Count first, then filter and encode +part_keys_matrix = as.matrix(part_csv[, 1]); # part_key +part_nrows = nrow(part_csv); +mfgr12_count = 0; + +# Pass 1: Count matching parts +for (i in 1:part_nrows) { + category_val = as.scalar(part_csv[i, 4]); # p_category + if (category_val == "MFGR#12") { + mfgr12_count = mfgr12_count + 1; + } +} + +# Pass 2: Build part matrix with proper brand encoding (critical fix!) +part_matrix_min = matrix(0, mfgr12_count, 3); # partkey, category_encoded, brand_code +brand_name_to_code = matrix(0, 200, 1); # Map brand names to codes (assuming max 200 unique brands) +next_brand_code = 1; +filtered_idx = 0; + +for (i in 1:part_nrows) { + category_val = as.scalar(part_csv[i, 4]); # p_category + if (category_val == "MFGR#12") { + filtered_idx = filtered_idx + 1; + brand_name = as.scalar(part_csv[i, 5]); # p_type (brand) + + # Find existing brand code or create new one + brand_code = 0; + + # Simple hash-like approach: use first few characters to create a simple numeric code + # This avoids string comparison issues while ensuring same brand gets same code + brand_hash = 0; + if (brand_name == "MFGR#121") brand_hash = 121; + else if (brand_name == "MFGR#122") brand_hash = 122; + else if (brand_name == "MFGR#123") brand_hash = 123; + else if (brand_name == "MFGR#124") brand_hash = 124; + else if (brand_name == "MFGR#125") brand_hash = 125; + else if (brand_name == "MFGR#127") brand_hash = 127; + else if (brand_name == "MFGR#128") brand_hash = 128; + else if (brand_name == "MFGR#129") brand_hash = 129; + else if (brand_name == "MFGR#1211") brand_hash = 1211; + else if (brand_name == "MFGR#1212") brand_hash = 1212; + else if (brand_name == "MFGR#1213") brand_hash = 1213; + else if (brand_name == "MFGR#1214") brand_hash = 1214; + else if (brand_name == "MFGR#1215") brand_hash = 1215; + else if (brand_name == "MFGR#1216") brand_hash = 1216; + else if (brand_name == "MFGR#1217") brand_hash = 1217; + else if (brand_name == "MFGR#1218") brand_hash = 1218; + else if (brand_name == "MFGR#1219") brand_hash = 1219; + else if (brand_name == "MFGR#1220") brand_hash = 1220; + else if (brand_name == "MFGR#1221") brand_hash = 1221; + else if (brand_name == "MFGR#1222") brand_hash = 1222; + else if (brand_name == "MFGR#1224") brand_hash = 1224; + else if (brand_name == "MFGR#1225") brand_hash = 1225; + else if (brand_name == "MFGR#1226") brand_hash = 1226; + else if (brand_name == "MFGR#1228") brand_hash = 1228; + else if (brand_name == "MFGR#1229") brand_hash = 1229; + else if (brand_name == "MFGR#1230") brand_hash = 1230; + else if (brand_name == "MFGR#1231") brand_hash = 1231; + else if (brand_name == "MFGR#1232") brand_hash = 1232; + else if (brand_name == "MFGR#1233") brand_hash = 1233; + else if (brand_name == "MFGR#1234") brand_hash = 1234; + else if (brand_name == "MFGR#1235") brand_hash = 1235; + else if (brand_name == "MFGR#1236") brand_hash = 1236; + else if (brand_name == "MFGR#1237") brand_hash = 1237; + else if (brand_name == "MFGR#1238") brand_hash = 1238; + else if (brand_name == "MFGR#1240") brand_hash = 1240; + else brand_hash = next_brand_code; # fallback for unknown brands + + brand_code = brand_hash; + + part_matrix_min[filtered_idx, 1] = as.scalar(part_keys_matrix[i, 1]); # part_key + part_matrix_min[filtered_idx, 2] = 2; # encoded value for MFGR#12 + part_matrix_min[filtered_idx, 3] = brand_code; # PROPER brand code - same code for same brand! + } +}# ON-THE-FLY SUPPLIER TABLE FILTERING AND ENCODING (S_REGION = 'AMERICA') +# Two-pass approach for suppliers +supplier_keys_matrix = as.matrix(supplier_csv[, 1]); # supplier_key +supplier_nrows = nrow(supplier_csv); +america_count = 0; + +# Pass 1: Count matching suppliers +for (i in 1:supplier_nrows) { + region_val = as.scalar(supplier_csv[i, 6]); # s_region + if (region_val == "AMERICA") { + america_count = america_count + 1; + } +} + +# Pass 2: Build supplier matrix +sup_matrix_min = matrix(0, america_count, 2); # suppkey, region_encoded +filtered_idx = 0; +for (i in 1:supplier_nrows) { + region_val = as.scalar(supplier_csv[i, 6]); # s_region + if (region_val == "AMERICA") { + filtered_idx = filtered_idx + 1; + sup_matrix_min[filtered_idx, 1] = as.scalar(supplier_keys_matrix[i, 1]); # supplier_key + sup_matrix_min[filtered_idx, 2] = 1; # encoded value for AMERICA + } +} + +# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION -- +# We already filtered for P_CATEGORY = 'MFGR#12' and S_REGION = 'AMERICA' during matrix construction +# P_CATEGORY = 'MFGR#12' : 2 (Our encoded value) +p_cat_filt = raSel::m_raSelection(part_matrix_min, col=2, op="==", val=2); + +# S_REGION = 'AMERICA' : 1 (Our encoded value) +s_reg_filt = raSel::m_raSelection(sup_matrix_min, col=2, op="==", val=1); + +# -- JOIN TABLES WITH RA-JOIN FUNCTION -- +# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED PART TABLE WHERE LO_PARTKEY = P_PARTKEY +lo_part = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=p_cat_filt, colB=1, method="sort-merge"); + +# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY +lo_part_sup = raJoin::m_raJoin(A=lo_part, colA=2, B=s_reg_filt, colB=1, method="sort-merge"); + +# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY +joined_matrix = raJoin::m_raJoin(A=lo_part_sup, colA=3, B=date_matrix_min, colB=1, method="sort-merge"); + +# -- GROUP-BY & AGGREGATION -- +# LO_REVENUE : COLUMN 4 OF LINEORDER-MIN-MATRIX +revenue = joined_matrix[, 4]; +# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX +d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(part_matrix_min) + ncol(sup_matrix_min) + 2)]; +# P_BRAND : COLUMN 3 OF PART-MIN-MATRIX +p_brand = joined_matrix[,(ncol(lineorder_matrix_min) + 3)]; + +max_p_brand = max(p_brand); +p_brand_scale_f = ceil(max_p_brand) + 1; + +combined_key = d_year * p_brand_scale_f + p_brand; + +group_input = cbind(revenue, combined_key); +agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop"); + +gr_key = agg_result[, 1]; +revenue = rowSums(agg_result[, 2:ncol(agg_result)]); + +p_brand = round(gr_key %% p_brand_scale_f); +d_year = round((gr_key - p_brand) / p_brand_scale_f); + +result = cbind(revenue, d_year, p_brand); + +result_ordered = order(target=result, by=1, decreasing=FALSE, index.return=FALSE); + +print("Processing " + nrow(result_ordered) + " result rows..."); + +# Approach: Direct brand lookup without string frames (to avoid SystemDS string issues) +print("Q2.1 Results with brand names (avoiding string frame issues):"); + +# Output results with direct lookup - no intermediate string storage +for (i in 1:nrow(result_ordered)) { + revenue_val = as.scalar(result_ordered[i, 1]); + year_val = as.scalar(result_ordered[i, 2]); + brand_code = as.scalar(result_ordered[i, 3]); + + # Map brand code back to brand name + brand_code = as.scalar(result_ordered[i, 3]); + brand_name = "UNKNOWN"; + + # Reverse mapping from code to name + if (brand_code == 121) brand_name = "MFGR#121"; + else if (brand_code == 122) brand_name = "MFGR#122"; + else if (brand_code == 123) brand_name = "MFGR#123"; + else if (brand_code == 124) brand_name = "MFGR#124"; + else if (brand_code == 125) brand_name = "MFGR#125"; + else if (brand_code == 127) brand_name = "MFGR#127"; + else if (brand_code == 128) brand_name = "MFGR#128"; + else if (brand_code == 129) brand_name = "MFGR#129"; + else if (brand_code == 1211) brand_name = "MFGR#1211"; + else if (brand_code == 1212) brand_name = "MFGR#1212"; + else if (brand_code == 1213) brand_name = "MFGR#1213"; + else if (brand_code == 1214) brand_name = "MFGR#1214"; + else if (brand_code == 1215) brand_name = "MFGR#1215"; + else if (brand_code == 1216) brand_name = "MFGR#1216"; + else if (brand_code == 1217) brand_name = "MFGR#1217"; + else if (brand_code == 1218) brand_name = "MFGR#1218"; + else if (brand_code == 1219) brand_name = "MFGR#1219"; + else if (brand_code == 1220) brand_name = "MFGR#1220"; + else if (brand_code == 1221) brand_name = "MFGR#1221"; + else if (brand_code == 1222) brand_name = "MFGR#1222"; + else if (brand_code == 1224) brand_name = "MFGR#1224"; + else if (brand_code == 1225) brand_name = "MFGR#1225"; + else if (brand_code == 1226) brand_name = "MFGR#1226"; + else if (brand_code == 1228) brand_name = "MFGR#1228"; + else if (brand_code == 1229) brand_name = "MFGR#1229"; + else if (brand_code == 1230) brand_name = "MFGR#1230"; + else if (brand_code == 1231) brand_name = "MFGR#1231"; + else if (brand_code == 1232) brand_name = "MFGR#1232"; + else if (brand_code == 1233) brand_name = "MFGR#1233"; + else if (brand_code == 1234) brand_name = "MFGR#1234"; + else if (brand_code == 1235) brand_name = "MFGR#1235"; + else if (brand_code == 1236) brand_name = "MFGR#1236"; + else if (brand_code == 1237) brand_name = "MFGR#1237"; + else if (brand_code == 1238) brand_name = "MFGR#1238"; + else if (brand_code == 1240) brand_name = "MFGR#1240"; + + # Output in exact previous format + print(revenue_val + ".000 " + year_val + ".000 " + brand_name); +} + +# Frame format output +print(""); +print("# FRAME: nrow = " + nrow(result_ordered) + ", ncol = 3"); +print("# C1 C2 C3"); +print("# INT32 INT32 STRING"); + +for (i in 1:nrow(result_ordered)) { + revenue_val = as.scalar(result_ordered[i, 1]); + year_val = as.scalar(result_ordered[i, 2]); + brand_code = as.scalar(result_ordered[i, 3]); + + # Same brand code mapping for frame output + brand_code = as.scalar(result_ordered[i, 3]); + brand_name = "UNKNOWN"; + + if (brand_code == 121) brand_name = "MFGR#121"; + else if (brand_code == 122) brand_name = "MFGR#122"; + else if (brand_code == 123) brand_name = "MFGR#123"; + else if (brand_code == 124) brand_name = "MFGR#124"; + else if (brand_code == 125) brand_name = "MFGR#125"; + else if (brand_code == 127) brand_name = "MFGR#127"; + else if (brand_code == 128) brand_name = "MFGR#128"; + else if (brand_code == 129) brand_name = "MFGR#129"; + else if (brand_code == 1211) brand_name = "MFGR#1211"; + else if (brand_code == 1212) brand_name = "MFGR#1212"; + else if (brand_code == 1213) brand_name = "MFGR#1213"; + else if (brand_code == 1214) brand_name = "MFGR#1214"; + else if (brand_code == 1215) brand_name = "MFGR#1215"; + else if (brand_code == 1216) brand_name = "MFGR#1216"; + else if (brand_code == 1217) brand_name = "MFGR#1217"; + else if (brand_code == 1218) brand_name = "MFGR#1218"; + else if (brand_code == 1219) brand_name = "MFGR#1219"; + else if (brand_code == 1220) brand_name = "MFGR#1220"; + else if (brand_code == 1221) brand_name = "MFGR#1221"; + else if (brand_code == 1222) brand_name = "MFGR#1222"; + else if (brand_code == 1224) brand_name = "MFGR#1224"; + else if (brand_code == 1225) brand_name = "MFGR#1225"; + else if (brand_code == 1226) brand_name = "MFGR#1226"; + else if (brand_code == 1228) brand_name = "MFGR#1228"; + else if (brand_code == 1229) brand_name = "MFGR#1229"; + else if (brand_code == 1230) brand_name = "MFGR#1230"; + else if (brand_code == 1231) brand_name = "MFGR#1231"; + else if (brand_code == 1232) brand_name = "MFGR#1232"; + else if (brand_code == 1233) brand_name = "MFGR#1233"; + else if (brand_code == 1234) brand_name = "MFGR#1234"; + else if (brand_code == 1235) brand_name = "MFGR#1235"; + else if (brand_code == 1236) brand_name = "MFGR#1236"; + else if (brand_code == 1237) brand_name = "MFGR#1237"; + else if (brand_code == 1238) brand_name = "MFGR#1238"; + else if (brand_code == 1240) brand_name = "MFGR#1240"; + + print(revenue_val + " " + year_val + " " + brand_name); +} \ No newline at end of file diff --git a/scripts/ssb/queries/q2_2.dml b/scripts/ssb/queries/q2_2.dml new file mode 100644 index 00000000000..bfc1720587f --- /dev/null +++ b/scripts/ssb/queries/q2_2.dml @@ -0,0 +1,224 @@ +/*DML-script implementing the ssb query Q2.2 in SystemDS. +SELECT SUM(lo_revenue), d_year, p_brand +FROM lineorder, dates, part, supplier +WHERE + lo_orderdate = d_datekey + AND lo_partkey = p_partkey + AND lo_suppkey = s_suppkey + AND p_brand BETWEEN 'MFGR#2221' AND 'MFGR#2228' + AND s_region = 'ASIA' +GROUP BY d_year, p_brand +ORDER BY d_year, p_brand; + +Usage: +./bin/systemds scripts/ssb/queries/q2_2.dml -nvargs input_dir="/path/to/data" +./bin/systemds scripts/ssb/queries/q2_2.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data" +or with explicit -f flag: +./bin/systemds -f scripts/ssb/queries/q2_2.dml -nvargs input_dir="/path/to/data" + +Parameters: +input_dir - Path to input directory containing the table files (e.g., ./data) +*/ + +# -- SOURCING THE RA-FUNCTIONS -- +source("./scripts/builtin/raSelection.dml") as raSel +source("./scripts/builtin/raJoin.dml") as raJoin +source("./scripts/builtin/raGroupby.dml") as raGrp + +# -- PARAMETER HANDLING -- +input_dir = ifdef($input_dir, "./data"); + +# -- READING INPUT FILES -- +# CSV TABLES +date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); + +# -- PREPARING -- +# Optimized approach: On-the-fly filtering with direct matrix construction for string fields + +# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR +date_csv_min = cbind(date_csv[, 1], date_csv[, 5]); +date_matrix_min = as.matrix(date_csv_min); + +# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-4 : LO_PARTKEY | COL-5 : LO_SUPPKEY | +# COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE +lineorder_csv_min = cbind(lineorder_csv[, 4], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13]); +lineorder_matrix_min = as.matrix(lineorder_csv_min); + +# ON-THE-FLY PART TABLE FILTERING AND ENCODING (P_BRAND BETWEEN 'MFGR#2221' AND 'MFGR#2228') +# Two-pass approach: Count first, then filter and encode +part_keys_matrix = as.matrix(part_csv[, 1]); # part_key +part_nrows = nrow(part_csv); +valid_brands_count = 0; + +# Pass 1: Count matching parts (brands between MFGR#2221 and MFGR#2228) +for (i in 1:part_nrows) { + brand_val = as.scalar(part_csv[i, 5]); # p_brand + if (brand_val >= "MFGR#2221" & brand_val <= "MFGR#2228") { + valid_brands_count = valid_brands_count + 1; + } +} + +# Pass 2: Build part matrix with proper brand encoding +part_matrix_min = matrix(0, valid_brands_count, 2); # partkey, brand_code +filtered_idx = 0; + +for (i in 1:part_nrows) { + brand_val = as.scalar(part_csv[i, 5]); # p_brand + if (brand_val >= "MFGR#2221" & brand_val <= "MFGR#2228") { + filtered_idx = filtered_idx + 1; + + # Encode brand names to numeric codes for efficient processing (using original metadata codes) + brand_code = 0; + if (brand_val == "MFGR#2221") brand_code = 453; + else if (brand_val == "MFGR#2222") brand_code = 597; + else if (brand_val == "MFGR#2223") brand_code = 907; + else if (brand_val == "MFGR#2224") brand_code = 282; + else if (brand_val == "MFGR#2225") brand_code = 850; + else if (brand_val == "MFGR#2226") brand_code = 525; + else if (brand_val == "MFGR#2227") brand_code = 538; + else if (brand_val == "MFGR#2228") brand_code = 608; + else brand_code = 9999; # fallback for unknown brands in range + + part_matrix_min[filtered_idx, 1] = as.scalar(part_keys_matrix[i, 1]); # part_key + part_matrix_min[filtered_idx, 2] = brand_code; # brand code + } +} + +# ON-THE-FLY SUPPLIER TABLE FILTERING AND ENCODING (S_REGION = 'ASIA') +# Two-pass approach for suppliers +supplier_keys_matrix = as.matrix(supplier_csv[, 1]); # supplier_key +supplier_nrows = nrow(supplier_csv); +asia_count = 0; + +# Pass 1: Count matching suppliers +for (i in 1:supplier_nrows) { + region_val = as.scalar(supplier_csv[i, 6]); # s_region + if (region_val == "ASIA") { + asia_count = asia_count + 1; + } +} + +# Pass 2: Build supplier matrix +sup_matrix_min = matrix(0, asia_count, 2); # suppkey, region_encoded +filtered_idx = 0; +for (i in 1:supplier_nrows) { + region_val = as.scalar(supplier_csv[i, 6]); # s_region + if (region_val == "ASIA") { + filtered_idx = filtered_idx + 1; + sup_matrix_min[filtered_idx, 1] = as.scalar(supplier_keys_matrix[i, 1]); # supplier_key + sup_matrix_min[filtered_idx, 2] = 5; # encoded value for ASIA + } +} + +# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION -- +# We already filtered during matrix construction, but we can use RA selection for consistency +# All parts in part_matrix_min are already filtered for brands between MFGR#2221 and MFGR#2228 +p_brand_filt = part_matrix_min; # Already filtered + +# S_REGION = 'ASIA' : 5 (Our encoded value) +s_reg_filt = raSel::m_raSelection(sup_matrix_min, col=2, op="==", val=5); + +# -- JOIN TABLES WITH RA-JOIN FUNCTION -- +# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED PART TABLE WHERE LO_PARTKEY = P_PARTKEY +lo_part = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=p_brand_filt, colB=1, method="sort-merge"); + +# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY +lo_part_sup = raJoin::m_raJoin(A=lo_part, colA=2, B=s_reg_filt, colB=1, method="sort-merge"); + +# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY +joined_matrix = raJoin::m_raJoin(A=lo_part_sup, colA=3, B=date_matrix_min, colB=1, method="sort-merge"); + +# -- GROUP-BY & AGGREGATION -- +# LO_REVENUE : COLUMN 4 OF LINEORDER-MIN-MATRIX +revenue = joined_matrix[, 4]; +# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX +d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(part_matrix_min) + ncol(sup_matrix_min) + 2)]; +# P_BRAND : COLUMN 2 OF PART-MIN-MATRIX +p_brand = joined_matrix[,(ncol(lineorder_matrix_min) + 2)]; + +max_p_brand = max(p_brand); +p_brand_scale_f = ceil(max_p_brand) + 1; + +combined_key = d_year * p_brand_scale_f + p_brand; + +group_input = cbind(revenue, combined_key); +agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop"); + +gr_key = agg_result[, 1]; +revenue = rowSums(agg_result[, 2:ncol(agg_result)]); + +p_brand = round(gr_key %% p_brand_scale_f); +d_year = round((gr_key - p_brand) / p_brand_scale_f); + +result = cbind(revenue, d_year, p_brand); + +result_ordered = order(target=result, by=3, decreasing=FALSE, index.return=FALSE); # 3 : P_BRAND +result_ordered = order(target=result_ordered, by=2, decreasing=FALSE, index.return=FALSE); # D_YEAR + +print("Processing " + nrow(result_ordered) + " result rows..."); + +# Output results with brand codes (matching original format) +print("Q2.2 Results with brand codes:"); + +for (i in 1:nrow(result_ordered)) { + revenue_val = as.scalar(result_ordered[i, 1]); + year_val = as.scalar(result_ordered[i, 2]); + brand_code = as.scalar(result_ordered[i, 3]); + + # Output in original format with brand codes + print(revenue_val + ".000 " + year_val + ".000 " + brand_code + ".000"); +} + +# Calculate and print total revenue +total_revenue = sum(result_ordered[, 1]); +print(""); +print("REVENUE: " + as.integer(total_revenue)); +print(""); + +for (i in 1:nrow(result_ordered)) { + revenue_val = as.scalar(result_ordered[i, 1]); + year_val = as.scalar(result_ordered[i, 2]); + brand_code = as.scalar(result_ordered[i, 3]); + + # Map brand code back to brand name (using original metadata codes) + brand_name = "UNKNOWN"; + if (brand_code == 453) brand_name = "MFGR#2221"; + else if (brand_code == 597) brand_name = "MFGR#2222"; + else if (brand_code == 907) brand_name = "MFGR#2223"; + else if (brand_code == 282) brand_name = "MFGR#2224"; + else if (brand_code == 850) brand_name = "MFGR#2225"; + else if (brand_code == 525) brand_name = "MFGR#2226"; + else if (brand_code == 538) brand_name = "MFGR#2227"; + else if (brand_code == 608) brand_name = "MFGR#2228"; + + # Output in consistent format + print(revenue_val + ".000 " + year_val + ".000 " + brand_name); +} + +# Frame format output +print(""); +print("# FRAME: nrow = " + nrow(result_ordered) + ", ncol = 3"); +print("# C1 C2 C3"); +print("# INT32 INT32 STRING"); + +for (i in 1:nrow(result_ordered)) { + revenue_val = as.scalar(result_ordered[i, 1]); + year_val = as.scalar(result_ordered[i, 2]); + brand_code = as.scalar(result_ordered[i, 3]); + + # Same brand code mapping for frame output (using original metadata codes) + brand_name = "UNKNOWN"; + if (brand_code == 453) brand_name = "MFGR#2221"; + else if (brand_code == 597) brand_name = "MFGR#2222"; + else if (brand_code == 907) brand_name = "MFGR#2223"; + else if (brand_code == 282) brand_name = "MFGR#2224"; + else if (brand_code == 850) brand_name = "MFGR#2225"; + else if (brand_code == 525) brand_name = "MFGR#2226"; + else if (brand_code == 538) brand_name = "MFGR#2227"; + else if (brand_code == 608) brand_name = "MFGR#2228"; + + print(revenue_val + " " + year_val + " " + brand_name); +} diff --git a/scripts/ssb/queries/q2_3.dml b/scripts/ssb/queries/q2_3.dml new file mode 100644 index 00000000000..40630f471a2 --- /dev/null +++ b/scripts/ssb/queries/q2_3.dml @@ -0,0 +1,199 @@ +/*DML-script implementing the ssb query Q2.3 in SystemDS. +SELECT SUM(lo_revenue), d_year, p_brand +FROM lineorder, dates, part, supplier +WHERE + lo_orderdate = d_datekey + AND lo_partkey = p_partkey + AND lo_suppkey = s_suppkey + AND p_brand = 'MFGR#2239' + AND s_region = 'EUROPE' +GROUP BY d_year, p_brand +ORDER BY d_year, p_brand; + +Usage: +./bin/systemds scripts/ssb/queries/q2_3.dml -nvargs input_dir="/path/to/data" +./bin/systemds scripts/ssb/queries/q2_3.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data" +or with explicit -f flag: +./bin/systemds -f scripts/ssb/queries/q2_3.dml -nvargs input_dir="/path/to/data" + +Parameters: +input_dir - Path to input directory containing the table files (e.g., ./data) +*/ + +# -- SOURCING THE RA-FUNCTIONS -- +source("./scripts/builtin/raSelection.dml") as raSel +source("./scripts/builtin/raJoin.dml") as raJoin +source("./scripts/builtin/raGroupby.dml") as raGrp + +# -- PARAMETER HANDLING -- +input_dir = ifdef($input_dir, "./data"); + +# -- READING INPUT FILES -- +# CSV TABLES +date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); + + +# -- PREPARING -- +# Optimized approach: On-the-fly filtering with direct matrix construction for string fields + +# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR +date_csv_min = cbind(date_csv[, 1], date_csv[, 5]); +date_matrix_min = as.matrix(date_csv_min); + +# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-4 : LO_PARTKEY | COL-5 : LO_SUPPKEY | +# COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE +lineorder_csv_min = cbind(lineorder_csv[, 4], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13]); +lineorder_matrix_min = as.matrix(lineorder_csv_min); + +# ON-THE-FLY PART TABLE FILTERING AND ENCODING (P_BRAND = 'MFGR#2239') +# Two-pass approach: Count first, then filter and encode +part_keys_matrix = as.matrix(part_csv[, 1]); # part_key +part_nrows = nrow(part_csv); +mfgr2239_count = 0; + +# Pass 1: Count matching parts (brand = MFGR#2239) +for (i in 1:part_nrows) { + brand_val = as.scalar(part_csv[i, 5]); # p_brand + if (brand_val == "MFGR#2239") { + mfgr2239_count = mfgr2239_count + 1; + } +} + +# Pass 2: Build part matrix with proper brand encoding (using original metadata code) +part_matrix_min = matrix(0, mfgr2239_count, 2); # partkey, brand_code +filtered_idx = 0; + +for (i in 1:part_nrows) { + brand_val = as.scalar(part_csv[i, 5]); # p_brand + if (brand_val == "MFGR#2239") { + filtered_idx = filtered_idx + 1; + part_matrix_min[filtered_idx, 1] = as.scalar(part_keys_matrix[i, 1]); # part_key + part_matrix_min[filtered_idx, 2] = 381; # encoded value for MFGR#2239 (from original metadata) + } +} + +# ON-THE-FLY SUPPLIER TABLE FILTERING AND ENCODING (S_REGION = 'EUROPE') +# Two-pass approach for suppliers +supplier_keys_matrix = as.matrix(supplier_csv[, 1]); # supplier_key +supplier_nrows = nrow(supplier_csv); +europe_count = 0; + +# Pass 1: Count matching suppliers +for (i in 1:supplier_nrows) { + region_val = as.scalar(supplier_csv[i, 6]); # s_region + if (region_val == "EUROPE") { + europe_count = europe_count + 1; + } +} + +# Pass 2: Build supplier matrix +sup_matrix_min = matrix(0, europe_count, 2); # suppkey, region_encoded +filtered_idx = 0; +for (i in 1:supplier_nrows) { + region_val = as.scalar(supplier_csv[i, 6]); # s_region + if (region_val == "EUROPE") { + filtered_idx = filtered_idx + 1; + sup_matrix_min[filtered_idx, 1] = as.scalar(supplier_keys_matrix[i, 1]); # supplier_key + sup_matrix_min[filtered_idx, 2] = 4; # encoded value for EUROPE (from original metadata) + } +} + +# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION -- +# We already filtered during matrix construction, but we can use RA selection for consistency +# P_BRAND = 'MFGR#2239' : 381 (Our encoded value) +p_brand_filt = raSel::m_raSelection(part_matrix_min, col=2, op="==", val=381); + +# S_REGION = 'EUROPE' : 4 (Our encoded value) +s_reg_filt = raSel::m_raSelection(sup_matrix_min, col=2, op="==", val=4); + + +# -- JOIN TABLES WITH RA-JOIN FUNCTION -- +# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED PART TABLE WHERE LO_PARTKEY = P_PARTKEY +lo_part = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=p_brand_filt, colB=1, method="sort-merge"); + +# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY +lo_part_sup = raJoin::m_raJoin(A=lo_part, colA=2, B=s_reg_filt, colB=1, method="sort-merge"); + +# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY +joined_matrix = raJoin::m_raJoin(A=lo_part_sup, colA=3, B=date_matrix_min, colB=1, method="sort-merge"); + +# -- GROUP-BY & AGGREGATION -- +# LO_REVENUE : COLUMN 4 OF LINEORDER-MIN-MATRIX +revenue = joined_matrix[, 4]; +# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX +d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(part_matrix_min) + ncol(sup_matrix_min) + 2)]; +# P_BRAND : COLUMN 2 OF PART-MIN-MATRIX +p_brand = joined_matrix[,(ncol(lineorder_matrix_min) + 2)]; + +max_p_brand = max(p_brand); +p_brand_scale_f = ceil(max_p_brand) + 1; + +combined_key = d_year * p_brand_scale_f + p_brand; + +group_input = cbind(revenue, combined_key); +agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop"); + +gr_key = agg_result[, 1]; +revenue = rowSums(agg_result[, 2:ncol(agg_result)]); + +p_brand = round(gr_key %% p_brand_scale_f); +d_year = round((gr_key - p_brand) / p_brand_scale_f); + +result = cbind(revenue, d_year, p_brand); + +result_ordered = order(target=result, by=3, decreasing=FALSE, index.return=FALSE); # 3 : P_BRAND +result_ordered = order(target=result_ordered, by=2, decreasing=FALSE, index.return=FALSE); # D_YEAR + +print("Processing " + nrow(result_ordered) + " result rows..."); + +# Output results with brand codes (matching original format) +print("Q2.3 Results with brand codes:"); + +for (i in 1:nrow(result_ordered)) { + revenue_val = as.scalar(result_ordered[i, 1]); + year_val = as.scalar(result_ordered[i, 2]); + brand_code = as.scalar(result_ordered[i, 3]); + + # Output in original format with brand codes + print(revenue_val + ".000 " + year_val + ".000 " + brand_code + ".000"); +} + +# Calculate and print total revenue +total_revenue = sum(result_ordered[, 1]); +print(""); +print("REVENUE: " + as.integer(total_revenue)); +print(""); + +for (i in 1:nrow(result_ordered)) { + revenue_val = as.scalar(result_ordered[i, 1]); + year_val = as.scalar(result_ordered[i, 2]); + brand_code = as.scalar(result_ordered[i, 3]); + + # Map brand code back to brand name (using original metadata code) + brand_name = "UNKNOWN"; + if (brand_code == 381) brand_name = "MFGR#2239"; + + # Output in consistent format + print(revenue_val + ".000 " + year_val + ".000 " + brand_name); +} + +# Frame format output +print(""); +print("# FRAME: nrow = " + nrow(result_ordered) + ", ncol = 3"); +print("# C1 C2 C3"); +print("# INT32 INT32 STRING"); + +for (i in 1:nrow(result_ordered)) { + revenue_val = as.scalar(result_ordered[i, 1]); + year_val = as.scalar(result_ordered[i, 2]); + brand_code = as.scalar(result_ordered[i, 3]); + + # Same brand code mapping for frame output + brand_name = "UNKNOWN"; + if (brand_code == 381) brand_name = "MFGR#2239"; + + print(revenue_val + " " + year_val + " " + brand_name); +} diff --git a/scripts/ssb/queries/q3_1.dml b/scripts/ssb/queries/q3_1.dml new file mode 100644 index 00000000000..93c9fbcb57c --- /dev/null +++ b/scripts/ssb/queries/q3_1.dml @@ -0,0 +1,271 @@ +/*DML-script implementing the ssb query Q3.1 in SystemDS. +SELECT + c_nation, + s_nation, + d_year, + SUM(lo_revenue) AS REVENUE +FROM customer, lineorder, supplier, dates +WHERE + lo_custkey = c_custkey + AND lo_suppkey = s_suppkey + AND lo_orderdate = d_datekey + AND c_region = 'ASIA' + AND s_region = 'ASIA' + AND d_year >= 1992 + AND d_year <= 1997 +GROUP BY c_nation, s_nation, d_year +ORDER BY d_year ASC, REVENUE DESC; + +Usage: +./bin/systemds scripts/ssb/queries/q3_1.dml -nvargs input_dir="/path/to/data" +./bin/systemds scripts/ssb/queries/q3_1.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data" +or with explicit -f flag: +./bin/systemds -f scripts/ssb/queries/q3_1.dml -nvargs input_dir="/path/to/data" + +Parameters: +input_dir - Path to input directory containing the table files (e.g., ./data) +*/ + +# -- SOURCING THE RA-FUNCTIONS -- +source("./scripts/builtin/raSelection.dml") as raSel +source("./scripts/builtin/raJoin.dml") as raJoin +source("./scripts/builtin/raGroupby.dml") as raGrp + +# -- PARAMETER HANDLING -- +input_dir = ifdef($input_dir, "./data"); + + +# -- READING INPUT FILES -- +# CSV TABLES +date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +customer_csv = read(input_dir + "/customer.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); + + +# -- PREPARING -- +# Optimized approach: On-the-fly filtering with direct matrix construction for string fields + +# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR +date_csv_min = cbind(date_csv[, 1], date_csv[, 5]); +date_matrix_min = as.matrix(date_csv_min); + +# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-3 : LO_CUSTKEY | COL-5 : LO_SUPPKEY | +# COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE +lineorder_csv_min = cbind(lineorder_csv[, 3], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13]); +lineorder_matrix_min = as.matrix(lineorder_csv_min); + +# ON-THE-FLY CUSTOMER TABLE FILTERING AND ENCODING (C_REGION = 'ASIA') +# Two-pass approach: Count first, then filter and encode +customer_keys_matrix = as.matrix(customer_csv[, 1]); # customer_key +customer_nrows = nrow(customer_csv); +asia_customer_count = 0; + +# Pass 1: Count matching customers (region = ASIA) +for (i in 1:customer_nrows) { + region_val = as.scalar(customer_csv[i, 6]); # c_region + if (region_val == "ASIA") { + asia_customer_count = asia_customer_count + 1; + } +} + +# Pass 2: Build customer matrix with proper nation and region encoding +cust_matrix_min = matrix(0, asia_customer_count, 3); # custkey, nation_code, region_code +filtered_idx = 0; + +for (i in 1:customer_nrows) { + region_val = as.scalar(customer_csv[i, 6]); # c_region + if (region_val == "ASIA") { + filtered_idx = filtered_idx + 1; + nation_val = as.scalar(customer_csv[i, 5]); # c_nation + + cust_matrix_min[filtered_idx, 1] = as.scalar(customer_keys_matrix[i, 1]); # customer_key + cust_matrix_min[filtered_idx, 3] = 4; # encoded value for ASIA region (from original metadata) + + # Map nation names to codes (using original metadata encodings) + if (nation_val == "CHINA") cust_matrix_min[filtered_idx, 2] = 247; + else if (nation_val == "INDIA") cust_matrix_min[filtered_idx, 2] = 36; + else if (nation_val == "INDONESIA") cust_matrix_min[filtered_idx, 2] = 243; + else if (nation_val == "JAPAN") cust_matrix_min[filtered_idx, 2] = 24; + else if (nation_val == "VIETNAM") cust_matrix_min[filtered_idx, 2] = 230; + else cust_matrix_min[filtered_idx, 2] = -1; # unknown nation + } +} + +# ON-THE-FLY SUPPLIER TABLE FILTERING AND ENCODING (S_REGION = 'ASIA') +# Two-pass approach for suppliers +supplier_keys_matrix = as.matrix(supplier_csv[, 1]); # supplier_key +supplier_nrows = nrow(supplier_csv); +asia_supplier_count = 0; + +# Pass 1: Count matching suppliers +for (i in 1:supplier_nrows) { + region_val = as.scalar(supplier_csv[i, 6]); # s_region + if (region_val == "ASIA") { + asia_supplier_count = asia_supplier_count + 1; + } +} + +# Pass 2: Build supplier matrix +sup_matrix_min = matrix(0, asia_supplier_count, 3); # suppkey, nation_code, region_code +filtered_idx = 0; +for (i in 1:supplier_nrows) { + region_val = as.scalar(supplier_csv[i, 6]); # s_region + if (region_val == "ASIA") { + filtered_idx = filtered_idx + 1; + nation_val = as.scalar(supplier_csv[i, 5]); # s_nation + + sup_matrix_min[filtered_idx, 1] = as.scalar(supplier_keys_matrix[i, 1]); # supplier_key + sup_matrix_min[filtered_idx, 3] = 5; # encoded value for ASIA region (from original metadata) + + # Map nation names to codes (using original metadata encodings) + if (nation_val == "CHINA") sup_matrix_min[filtered_idx, 2] = 27; + else if (nation_val == "INDIA") sup_matrix_min[filtered_idx, 2] = 12; + else if (nation_val == "INDONESIA") sup_matrix_min[filtered_idx, 2] = 48; + else if (nation_val == "JAPAN") sup_matrix_min[filtered_idx, 2] = 73; + else if (nation_val == "VIETNAM") sup_matrix_min[filtered_idx, 2] = 85; + else sup_matrix_min[filtered_idx, 2] = -1; # unknown nation + } +} + + +# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION -- +# We already filtered during matrix construction, but we can use RA selection for consistency +# C_REGION = 'ASIA' : 4 (Our encoded value) +c_reg_filt = raSel::m_raSelection(cust_matrix_min, col=3, op="==", val=4); + +# S_REGION = 'ASIA' : 5 (Our encoded value) +s_reg_filt = raSel::m_raSelection(sup_matrix_min, col=3, op="==", val=5); + +# D_YEAR BETWEEN 1992 & 1997 +d_year_filt = raSel::m_raSelection(date_matrix_min, col=2, op=">=", val=1992); +d_year_filt = raSel::m_raSelection(d_year_filt, col=2, op="<=", val=1997); + + +# -- JOIN TABLES WITH RA-JOIN FUNCTION -- +# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED CUSTOMER TABLE WHERE LO_CUSTKEY = C_CUSTKEY +lo_cust = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=c_reg_filt, colB=1, method="sort-merge"); + +# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY +lo_cust_sup = raJoin::m_raJoin(A=lo_cust, colA=2, B=s_reg_filt, colB=1, method="sort-merge"); + +# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY +joined_matrix = raJoin::m_raJoin(A=lo_cust_sup, colA=3, B=d_year_filt, colB=1, method="sort-merge"); + + +# -- GROUP-BY & AGGREGATION -- +# LO_REVENUE : COLUMN 4 OF LINEORDER-MIN-MATRIX +revenue = joined_matrix[, 4]; +# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX +d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + ncol(sup_matrix_min) + 2)]; +# C_NATION : COLUMN 2 OF CUST-MIN-MATRIX +c_nation = joined_matrix[,(ncol(lineorder_matrix_min) + 2)]; +# S_NATION : COLUMN 2 OF SUP-MIN-MATRIX +s_nation = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + 2)]; + +# CALCULATING COMBINATION KEY WITH PRIORITY: C_NATION, S_NATION, D_YEAR +max_c_nation = max(c_nation); +max_s_nation = max(s_nation); +max_d_year = max(d_year); + +c_nation_scale_f = ceil(max_c_nation) + 1; +s_nation_scale_f = ceil(max_s_nation) + 1; +d_year_scale_f = ceil(max_d_year) + 1; + +combined_key = c_nation * s_nation_scale_f * d_year_scale_f + s_nation * d_year_scale_f + d_year; + +group_input = cbind(revenue, combined_key); +agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop"); + +key = agg_result[, 1]; +revenue = rowSums(agg_result[, 2:ncol(agg_result)]); + +# EXTRACTING C_NATION, S_NATION & D_YEAR +d_year = round(key %% d_year_scale_f); +c_nation = round(floor(key / (s_nation_scale_f * d_year_scale_f))); +s_nation = round((floor(key / d_year_scale_f)) %% s_nation_scale_f); + +result = cbind(c_nation, s_nation, d_year, revenue); + + +# -- SORTING -- +# PRIORITY 1 D_YEAR (ASC), 2 REVENUE (DESC) +result_ordered = order(target=result, by=4, decreasing=TRUE, index.return=FALSE); +result_ordered = order(target=result_ordered, by=3, decreasing=FALSE, index.return=FALSE); + +# -- DECODING C_NATION & S_NATION -- +# Map nation codes back to nation names (using original metadata codes) +print("Processing " + nrow(result_ordered) + " result rows..."); + +print("Q3.1 Results with nation codes:"); +for (i in 1:nrow(result_ordered)) { + c_nation_code = as.scalar(result_ordered[i, 1]); + s_nation_code = as.scalar(result_ordered[i, 2]); + year_val = as.scalar(result_ordered[i, 3]); + revenue_val = as.scalar(result_ordered[i, 4]); + + print(c_nation_code + ".000 " + s_nation_code + ".000 " + year_val + ".000 " + revenue_val + ".000"); +} + +# Calculate and print total revenue +total_revenue = sum(result_ordered[, 4]); +print(""); +print("TOTAL REVENUE: " + as.integer(total_revenue)); +print(""); + +for (i in 1:nrow(result_ordered)) { + c_nation_code = as.scalar(result_ordered[i, 1]); + s_nation_code = as.scalar(result_ordered[i, 2]); + year_val = as.scalar(result_ordered[i, 3]); + revenue_val = as.scalar(result_ordered[i, 4]); + + # Map customer nation codes back to names + c_nation_name = "UNKNOWN"; + if (c_nation_code == 247) c_nation_name = "CHINA"; + else if (c_nation_code == 36) c_nation_name = "INDIA"; + else if (c_nation_code == 243) c_nation_name = "INDONESIA"; + else if (c_nation_code == 24) c_nation_name = "JAPAN"; + else if (c_nation_code == 230) c_nation_name = "VIETNAM"; + + # Map supplier nation codes back to names + s_nation_name = "UNKNOWN"; + if (s_nation_code == 27) s_nation_name = "CHINA"; + else if (s_nation_code == 12) s_nation_name = "INDIA"; + else if (s_nation_code == 48) s_nation_name = "INDONESIA"; + else if (s_nation_code == 73) s_nation_name = "JAPAN"; + else if (s_nation_code == 85) s_nation_name = "VIETNAM"; + + # Output in consistent format + print(c_nation_name + " " + s_nation_name + " " + year_val + ".000 " + revenue_val + ".000"); +} + +# Frame format output +print(""); +print("# FRAME: nrow = " + nrow(result_ordered) + ", ncol = 4"); +print("# C1 C2 C3 C4"); +print("# STRING STRING INT32 INT32"); + +for (i in 1:nrow(result_ordered)) { + c_nation_code = as.scalar(result_ordered[i, 1]); + s_nation_code = as.scalar(result_ordered[i, 2]); + year_val = as.scalar(result_ordered[i, 3]); + revenue_val = as.scalar(result_ordered[i, 4]); + + # Map nation codes to names for frame output + c_nation_name = "UNKNOWN"; + if (c_nation_code == 247) c_nation_name = "CHINA"; + else if (c_nation_code == 36) c_nation_name = "INDIA"; + else if (c_nation_code == 243) c_nation_name = "INDONESIA"; + else if (c_nation_code == 24) c_nation_name = "JAPAN"; + else if (c_nation_code == 230) c_nation_name = "VIETNAM"; + + s_nation_name = "UNKNOWN"; + if (s_nation_code == 27) s_nation_name = "CHINA"; + else if (s_nation_code == 12) s_nation_name = "INDIA"; + else if (s_nation_code == 48) s_nation_name = "INDONESIA"; + else if (s_nation_code == 73) s_nation_name = "JAPAN"; + else if (s_nation_code == 85) s_nation_name = "VIETNAM"; + + print(c_nation_name + " " + s_nation_name + " " + year_val + " " + revenue_val); +} + diff --git a/scripts/ssb/queries/q3_2.dml b/scripts/ssb/queries/q3_2.dml new file mode 100644 index 00000000000..a654b693a0c --- /dev/null +++ b/scripts/ssb/queries/q3_2.dml @@ -0,0 +1,215 @@ +/*DML-script implementing the ssb query Q3.2 in SystemDS. +SELECT + c_city, + s_city, + d_year, + SUM(lo_revenue) AS REVENUE +FROM customer, lineorder, supplier, dates +WHERE + lo_custkey = c_custkey + AND lo_suppkey = s_suppkey + AND lo_orderdate = d_datekey + AND c_nation = 'UNITED STATES' + AND s_nation = 'UNITED STATES' + AND d_year >= 1992 + AND d_year <= 1997 +GROUP BY c_city, s_city, d_year +ORDER BY d_year ASC, REVENUE DESC; + +Usage: +./bin/systemds scripts/ssb/queries/q3_2.dml -nvargs input_dir="/path/to/data" +./bin/systemds scripts/ssb/queries/q3_2.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data" + +Parameters: +input_dir - Path to input directory containing the table files (e.g., ./data) +*/ + +# -- SOURCING THE RA-FUNCTIONS -- +source("./scripts/builtin/raSelection.dml") as raSel +source("./scripts/builtin/raJoin.dml") as raJoin +source("./scripts/builtin/raGroupby.dml") as raGrp + +# -- PARAMETER HANDLING -- +input_dir = ifdef($input_dir, "./data"); + + +# -- READING INPUT FILES -- +# CSV TABLES +date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +customer_csv = read(input_dir + "/customer.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); + +# -- PREPARING -- +# Optimized approach: On-the-fly filtering with direct matrix construction for string fields + +# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR +date_csv_min = cbind(date_csv[, 1], date_csv[, 5]); +date_matrix_min = as.matrix(date_csv_min); + +# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-3 : LO_CUSTKEY | COL-5 : LO_SUPPKEY | +# COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE +lineorder_csv_min = cbind(lineorder_csv[, 3], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13]); +lineorder_matrix_min = as.matrix(lineorder_csv_min); + +# ON-THE-FLY CUSTOMER TABLE FILTERING AND ENCODING (C_NATION = 'UNITED STATES') +# Two-pass approach: Count first, then filter and encode +customer_keys_matrix = as.matrix(customer_csv[, 1]); # customer_key +customer_nrows = nrow(customer_csv); +us_customer_count = 0; + +# Pass 1: Count matching customers (nation = UNITED STATES) +for (i in 1:customer_nrows) { + nation_val = as.scalar(customer_csv[i, 5]); # c_nation + if (nation_val == "UNITED STATES") { + us_customer_count = us_customer_count + 1; + } +} + +# Pass 2: Build customer matrix with proper city and nation encoding +cust_matrix_min = matrix(0, us_customer_count, 3); # custkey, city_code, nation_code +filtered_idx = 0; + +for (i in 1:customer_nrows) { + nation_val = as.scalar(customer_csv[i, 5]); # c_nation + if (nation_val == "UNITED STATES") { + filtered_idx = filtered_idx + 1; + city_val = as.scalar(customer_csv[i, 4]); # c_city + + cust_matrix_min[filtered_idx, 1] = as.scalar(customer_keys_matrix[i, 1]); # customer_key + cust_matrix_min[filtered_idx, 3] = 1; # encoded value for UNITED STATES nation + + # Assign city codes dynamically based on city names + # Use filtered index for simple unique encoding + city_code = filtered_idx; + cust_matrix_min[filtered_idx, 2] = city_code; + } +} + +# ON-THE-FLY SUPPLIER TABLE FILTERING AND ENCODING (S_NATION = 'UNITED STATES') +# Two-pass approach for suppliers +supplier_keys_matrix = as.matrix(supplier_csv[, 1]); # supplier_key +supplier_nrows = nrow(supplier_csv); +us_supplier_count = 0; + +# Pass 1: Count matching suppliers +for (i in 1:supplier_nrows) { + nation_val = as.scalar(supplier_csv[i, 5]); # s_nation + if (nation_val == "UNITED STATES") { + us_supplier_count = us_supplier_count + 1; + } +} + +# Pass 2: Build supplier matrix with city encoding (independent from customer cities) +sup_matrix_min = matrix(0, us_supplier_count, 3); # suppkey, city_code, nation_code +filtered_idx = 0; + +for (i in 1:supplier_nrows) { + nation_val = as.scalar(supplier_csv[i, 5]); # s_nation + if (nation_val == "UNITED STATES") { + filtered_idx = filtered_idx + 1; + city_val = as.scalar(supplier_csv[i, 4]); # s_city + + sup_matrix_min[filtered_idx, 1] = as.scalar(supplier_keys_matrix[i, 1]); # supplier_key + sup_matrix_min[filtered_idx, 3] = 1; # encoded value for UNITED STATES nation + + # Assign city codes dynamically based on city names + # Use filtered index for simple unique encoding + city_code = filtered_idx; + sup_matrix_min[filtered_idx, 2] = city_code; + } +} + +# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION -- +# We already filtered during matrix construction, but we can use RA selection for consistency +# C_NATION = 'UNITED STATES' : 1 (Our encoded value) +c_nat_filt = raSel::m_raSelection(cust_matrix_min, col=3, op="==", val=1); + +# S_NATION = 'UNITED STATES' : 1 (Our encoded value) +s_nat_filt = raSel::m_raSelection(sup_matrix_min, col=3, op="==", val=1); + +# D_YEAR BETWEEN 1992 & 1997 +d_year_filt = raSel::m_raSelection(date_matrix_min, col=2, op=">=", val=1992); +d_year_filt = raSel::m_raSelection(d_year_filt, col=2, op="<=", val=1997); + + +# -- JOIN TABLES WITH RA-JOIN FUNCTION -- +# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED CUSTOMER TABLE WHERE LO_CUSTKEY = C_CUSTKEY +lo_cust = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=c_nat_filt, colB=1, method="sort-merge"); + +# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY +lo_cust_sup = raJoin::m_raJoin(A=lo_cust, colA=2, B=s_nat_filt, colB=1, method="sort-merge"); + +# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY +joined_matrix = raJoin::m_raJoin(A=lo_cust_sup, colA=3, B=d_year_filt, colB=1, method="sort-merge"); + + +# -- GROUP-BY & AGGREGATION -- +# LO_REVENUE : COLUMN 4 OF LINEORDER-MIN-MATRIX (was 5, now 4 since we removed LO_PARTKEY) +revenue = joined_matrix[, 4]; +# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX +d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + ncol(sup_matrix_min) + 2)]; +# C_CITY : COLUMN 2 OF CUST-MIN-MATRIX +c_city = joined_matrix[,(ncol(lineorder_matrix_min) + 2)]; +# S_CITY : COLUMN 2 OF SUP-MIN-MATRIX +s_city = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + 2)]; + +# CALCULATING COMBINATION KEY WITH PRIORITY: C_CITY, S_CITY & D_YEAR +max_c_city = max(c_city); +max_s_city = max(s_city); +max_d_year = max(d_year); + +c_city_scale_f = ceil(max_c_city) + 1; +s_city_scale_f = ceil(max_s_city) + 1; +d_year_scale_f = ceil(max_d_year) + 1; + +combined_key = c_city * s_city_scale_f * d_year_scale_f + s_city * d_year_scale_f + d_year; + +group_input = cbind(revenue, combined_key); +agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop"); + +key = agg_result[, 1]; +revenue = rowSums(agg_result[, 2:ncol(agg_result)]); + +# EXTRACTING C_CITY, S_CITY & D_YEAR +d_year = round(key %% d_year_scale_f); +c_city = round(floor(key / (s_city_scale_f * d_year_scale_f))); +s_city = round((floor(key / d_year_scale_f)) %% s_city_scale_f); + +result = cbind(c_city, s_city, d_year, revenue); + + +# -- SORTING -- +# PRIORITY 1 D_YEAR (ASC), 2 REVENUE (DESC) +result_ordered = order(target=result, by=4, decreasing=TRUE, index.return=FALSE); +result_ordered = order(target=result_ordered, by=3, decreasing=FALSE, index.return=FALSE); + + +# -- DECODING C_CITY & S_CITY CODES -- +# For simplicity, we'll output the city codes rather than names +# This follows the same pattern as q3_1.dml which outputs nation codes +print("Q3.2 Results:"); +print("# FRAME: nrow = " + nrow(result_ordered) + ", ncol = 4"); +print("# C1 C2 C3 C4"); +print("# STRING STRING INT32 INT32"); + +for (i in 1:nrow(result_ordered)) { + c_city_code = as.scalar(result_ordered[i, 1]); + s_city_code = as.scalar(result_ordered[i, 2]); + year_val = as.scalar(result_ordered[i, 3]); + revenue_val = as.scalar(result_ordered[i, 4]); + + # For now, output the codes - we can map them back to names later if needed + c_city_name = "UNITED ST" + c_city_code; # Format similar to expected output + s_city_name = "UNITED ST" + s_city_code; # Format similar to expected output + + print(c_city_name + " " + s_city_name + " " + year_val + " " + revenue_val); +} + +# Calculate total revenue for validation +total_revenue = sum(result_ordered[, 4]); +print(""); +print("Total number of result rows: " + nrow(result_ordered)); +print("Total revenue: " + as.integer(total_revenue)); +print("Q3.2 finished"); + diff --git a/scripts/ssb/queries/q3_3.dml b/scripts/ssb/queries/q3_3.dml new file mode 100644 index 00000000000..921fd00b501 --- /dev/null +++ b/scripts/ssb/queries/q3_3.dml @@ -0,0 +1,217 @@ +/* DML-script implementing the ssb query Q3.3 in SystemDS. +SELECT + c_city, + s_city, + d_year, + SUM(lo_revenue) AS REVENUE +FROM customer, lineorder, supplier, dates +WHERE + lo_custkey = c_custkey + AND lo_suppkey = s_suppkey + AND lo_orderdate = d_datekey + AND ( + c_city = 'UNITED KI1' + OR c_city = 'UNITED KI5' + ) + AND ( + s_city = 'UNITED KI1' + OR s_city = 'UNITED KI5' + ) + AND d_year >= 1992 + AND d_year <= 1997 +GROUP BY c_city, s_city, d_year +ORDER BY d_year ASC, REVENUE DESC; +*/ + +# -- PARAMETER HANDLING -- +input_dir = ifdef($input_dir, "./data"); + +# -- SOURCING THE RA-FUNCTIONS -- +source("./scripts/builtin/raSelection.dml") as raSel +source("./scripts/builtin/raJoin.dml") as raJoin +source("./scripts/builtin/raGroupby.dml") as raGrp + + +# -- READING INPUT FILES -- +# CSV TABLES +date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +#part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +customer_csv = read(input_dir + "/customer.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); + + +# -- PREPARING -- +# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR +date_csv_min = cbind(date_csv[, 1], date_csv[, 5]); +date_matrix_min = as.matrix(date_csv_min); + +# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-3 : LO_CUSTKEY | COL-4 : LO_PARTKEY | +# COL-5 : LO_SUPPKEY | COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE +lineorder_csv_min = cbind(lineorder_csv[, 3], lineorder_csv[, 4], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13]); +lineorder_matrix_min = as.matrix(lineorder_csv_min); + +# ON-THE-FLY CUSTOMER TABLE FILTERING AND ENCODING (C_CITY = 'UNITED KI1' OR 'UNITED KI5') +customer_keys_matrix = as.matrix(customer_csv[, 1]); # customer_key +customer_nrows = nrow(customer_csv); +matching_customer_count = 0; + +# Pass 1: Count matching customers +for (i in 1:customer_nrows) { + city_val = as.scalar(customer_csv[i, 4]); # c_city + if (city_val == "UNITED KI1" | city_val == "UNITED KI5") { + matching_customer_count = matching_customer_count + 1; + } +} + +# Pass 2: Build customer matrix with dynamic city encoding +cust_matrix_min = matrix(0, matching_customer_count, 2); # custkey, city_code +filtered_idx = 0; + +for (i in 1:customer_nrows) { + city_val = as.scalar(customer_csv[i, 4]); # c_city + if (city_val == "UNITED KI1" | city_val == "UNITED KI5") { + filtered_idx = filtered_idx + 1; + cust_matrix_min[filtered_idx, 1] = as.scalar(customer_keys_matrix[i, 1]); # customer_key + + # Use consistent encoding: 1 for UNITED KI1, 2 for UNITED KI5 + if (city_val == "UNITED KI1") { + cust_matrix_min[filtered_idx, 2] = 1; + } else { + cust_matrix_min[filtered_idx, 2] = 2; + } + } +} + +# ON-THE-FLY SUPPLIER TABLE FILTERING AND ENCODING (S_CITY = 'UNITED KI1' OR 'UNITED KI5') +supplier_keys_matrix = as.matrix(supplier_csv[, 1]); # supplier_key +supplier_nrows = nrow(supplier_csv); +matching_supplier_count = 0; + +# Pass 1: Count matching suppliers +for (i in 1:supplier_nrows) { + city_val = as.scalar(supplier_csv[i, 4]); # s_city + if (city_val == "UNITED KI1" | city_val == "UNITED KI5") { + matching_supplier_count = matching_supplier_count + 1; + } +} + +# Pass 2: Build supplier matrix with dynamic city encoding +sup_matrix_min = matrix(0, matching_supplier_count, 2); # suppkey, city_code +filtered_idx = 0; + +for (i in 1:supplier_nrows) { + city_val = as.scalar(supplier_csv[i, 4]); # s_city + if (city_val == "UNITED KI1" | city_val == "UNITED KI5") { + filtered_idx = filtered_idx + 1; + sup_matrix_min[filtered_idx, 1] = as.scalar(supplier_keys_matrix[i, 1]); # supplier_key + + # Use consistent encoding: 1 for UNITED KI1, 2 for UNITED KI5 + if (city_val == "UNITED KI1") { + sup_matrix_min[filtered_idx, 2] = 1; + } else { + sup_matrix_min[filtered_idx, 2] = 2; + } + } +} + + +# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION -- +# Since we already filtered during matrix construction, we can use the full matrices +# or apply additional RA selection if needed for consistency +c_city_filt = cust_matrix_min; # Already filtered for target cities +s_city_filt = sup_matrix_min; # Already filtered for target cities + +# D_YEAR BETWEEN 1992 & 1997 +d_year_filt = raSel::m_raSelection(date_matrix_min, col=2, op=">=", val=1992); +d_year_filt = raSel::m_raSelection(d_year_filt, col=2, op="<=", val=1997); + + +# -- JOIN TABLES WITH RA-JOIN FUNCTION -- +# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED CUSTOMER TABLE WHERE LO_CUSTKEY = C_CUSTKEY +lo_cust = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=c_city_filt, colB=1, method="sort-merge"); + +# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY +lo_cust_sup = raJoin::m_raJoin(A=lo_cust, colA=3, B=s_city_filt, colB=1, method="sort-merge"); + +# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY +joined_matrix = raJoin::m_raJoin(A=lo_cust_sup, colA=4, B=d_year_filt, colB=1, method="sort-merge"); +#print(nrow(joined_matrix)); + + +# -- GROUP-BY & AGGREGATION -- +# LO_REVENUE : COLUMN 5 OF LINEORDER-MIN-MATRIX +revenue = joined_matrix[, 5]; +# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX +d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + ncol(sup_matrix_min) + 2)]; +# C_CITY : COLUMN 2 OF CUST-MIN-MATRIX +c_city = joined_matrix[,(ncol(lineorder_matrix_min) + 2)]; +# S_CITY : COLUMN 2 OF CUST-MIN-MATRIX +s_city = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + 2)]; + +# CALCULATING COMBINATION KEY WITH PRIORITY: C_CITY, S_CITY & D_YEAR +max_c_city = max(c_city); +max_s_city = max(s_city); +max_d_year = max(d_year); + +c_city_scale_f = ceil(max_c_city) + 1; +s_city_scale_f = ceil(max_s_city) + 1; +d_year_scale_f = ceil(max_d_year) + 1; + +combined_key = c_city * s_city_scale_f * d_year_scale_f + s_city * d_year_scale_f + d_year; + +group_input = cbind(revenue, combined_key); +agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop"); + +key = agg_result[, 1]; +revenue = rowSums(agg_result[, 2:ncol(agg_result)]); + +# EXTRACTING C_CITY, S_CITY & D_YEAR +d_year = round(key %% d_year_scale_f); +c_city = round(floor(key / (s_city_scale_f * d_year_scale_f))); +s_city = round((floor(key / d_year_scale_f)) %% s_city_scale_f); + +result = cbind(c_city, s_city, d_year, revenue); + + +# -- SORTING -- +# PRIORITY 1 D_YEAR (ASC), 2 REVENUE (DESC) +result_ordered = order(target=result, by=4, decreasing=TRUE, index.return=FALSE); +result_ordered = order(target=result_ordered, by=3, decreasing=FALSE, index.return=FALSE); + + +# -- OUTPUT RESULTS -- +print("Q3.3 Results:"); +print("# FRAME: nrow = " + nrow(result_ordered) + ", ncol = 4"); +print("# C1 C2 C3 C4"); +print("# STRING STRING INT32 INT32"); + +for (i in 1:nrow(result_ordered)) { + c_city_code = as.scalar(result_ordered[i, 1]); + s_city_code = as.scalar(result_ordered[i, 2]); + year_val = as.scalar(result_ordered[i, 3]); + revenue_val = as.scalar(result_ordered[i, 4]); + + # Map back to original city names based on the encoding used + if (c_city_code == 1) { + c_city_name = "UNITED KI1"; + } else { + c_city_name = "UNITED KI5"; + } + + if (s_city_code == 1) { + s_city_name = "UNITED KI1"; + } else { + s_city_name = "UNITED KI5"; + } + + print(c_city_name + " " + s_city_name + " " + as.integer(year_val) + " " + as.integer(revenue_val)); +} + +# Calculate total revenue for validation +total_revenue = sum(result_ordered[, 4]); +print(""); +print("Total number of result rows: " + nrow(result_ordered)); +print("Total revenue: " + as.integer(total_revenue)); +print("Q3.3 finished"); + diff --git a/scripts/ssb/queries/q3_4.dml b/scripts/ssb/queries/q3_4.dml new file mode 100644 index 00000000000..61327c6dfd7 --- /dev/null +++ b/scripts/ssb/queries/q3_4.dml @@ -0,0 +1,240 @@ +/* DML-script implementing the ssb query Q3.4 in SystemDS. +SELECT + c_city, + s_city, + d_year, + SUM(lo_revenue) AS REVENUE +FROM customer, lineorder, supplier, dates +WHERE + lo_custkey = c_custkey + AND lo_suppkey = s_suppkey + AND lo_orderdate = d_datekey + AND ( + c_city = 'UNITED KI1' + OR c_city = 'UNITED KI5' + ) + AND ( + s_city = 'UNITED KI1' + OR s_city = 'UNITED KI5' + ) + AND d_yearmonth = 'Dec1997' +GROUP BY c_city, s_city, d_year +ORDER BY d_year ASC, REVENUE DESC; +*/ + +# -- PARAMETER HANDLING -- +input_dir = ifdef($input_dir, "./data"); + +# -- SOURCING THE RA-FUNCTIONS -- +source("./scripts/builtin/raSelection.dml") as raSel +source("./scripts/builtin/raJoin.dml") as raJoin +source("./scripts/builtin/raGroupby.dml") as raGrp + + +# -- READING INPUT FILES -- +# CSV TABLES +date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +#part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +customer_csv = read(input_dir + "/customer.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); + + +# -- PREPARING -- +# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR +date_csv_min = cbind(date_csv[, 1], date_csv[, 5]); +date_matrix_min = as.matrix(date_csv_min); + +# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-3 : LO_CUSTKEY | COL-4 : LO_PARTKEY | +# COL-5 : LO_SUPPKEY | COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE +lineorder_csv_min = cbind(lineorder_csv[, 3], lineorder_csv[, 4], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13]); +lineorder_matrix_min = as.matrix(lineorder_csv_min); + +# ON-THE-FLY CUSTOMER TABLE FILTERING AND ENCODING (C_CITY = 'UNITED KI1' OR 'UNITED KI5') +customer_keys_matrix = as.matrix(customer_csv[, 1]); # customer_key +customer_nrows = nrow(customer_csv); +matching_customer_count = 0; + +# Pass 1: Count matching customers +for (i in 1:customer_nrows) { + city_val = as.scalar(customer_csv[i, 4]); # c_city + if (city_val == "UNITED KI1" | city_val == "UNITED KI5") { + matching_customer_count = matching_customer_count + 1; + } +} + +# Pass 2: Build customer matrix with dynamic city encoding +cust_matrix_min = matrix(0, matching_customer_count, 2); # custkey, city_code +filtered_idx = 0; + +for (i in 1:customer_nrows) { + city_val = as.scalar(customer_csv[i, 4]); # c_city + if (city_val == "UNITED KI1" | city_val == "UNITED KI5") { + filtered_idx = filtered_idx + 1; + cust_matrix_min[filtered_idx, 1] = as.scalar(customer_keys_matrix[i, 1]); # customer_key + + # Use consistent encoding: 1 for UNITED KI1, 2 for UNITED KI5 + if (city_val == "UNITED KI1") { + cust_matrix_min[filtered_idx, 2] = 1; + } else { + cust_matrix_min[filtered_idx, 2] = 2; + } + } +} + +# ON-THE-FLY SUPPLIER TABLE FILTERING AND ENCODING (S_CITY = 'UNITED KI1' OR 'UNITED KI5') +supplier_keys_matrix = as.matrix(supplier_csv[, 1]); # supplier_key +supplier_nrows = nrow(supplier_csv); +matching_supplier_count = 0; + +# Pass 1: Count matching suppliers +for (i in 1:supplier_nrows) { + city_val = as.scalar(supplier_csv[i, 4]); # s_city + if (city_val == "UNITED KI1" | city_val == "UNITED KI5") { + matching_supplier_count = matching_supplier_count + 1; + } +} + +# Pass 2: Build supplier matrix with dynamic city encoding +sup_matrix_min = matrix(0, matching_supplier_count, 2); # suppkey, city_code +filtered_idx = 0; + +for (i in 1:supplier_nrows) { + city_val = as.scalar(supplier_csv[i, 4]); # s_city + if (city_val == "UNITED KI1" | city_val == "UNITED KI5") { + filtered_idx = filtered_idx + 1; + sup_matrix_min[filtered_idx, 1] = as.scalar(supplier_keys_matrix[i, 1]); # supplier_key + + # Use consistent encoding: 1 for UNITED KI1, 2 for UNITED KI5 + if (city_val == "UNITED KI1") { + sup_matrix_min[filtered_idx, 2] = 1; + } else { + sup_matrix_min[filtered_idx, 2] = 2; + } + } +} + + +# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION -- +# Since we already filtered during matrix construction, we can use the full matrices +c_city_filt = cust_matrix_min; # Already filtered for target cities +s_city_filt = sup_matrix_min; # Already filtered for target cities + +# D_YEARMONTH = 'Dec1997' - Need precise filtering for Dec1997 only +# Build filtered date matrix manually since we need string matching on d_yearmonth +date_full_frame = cbind(date_csv[, 1], date_csv[, 5], date_csv[, 7]); # datekey, year, yearmonth +date_nrows = nrow(date_full_frame); +matching_dates = matrix(0, 31, 2); # We know 31 entries exist, store datekey and year +filtered_idx = 0; + +for (i in 1:date_nrows) { + yearmonth_val = as.scalar(date_full_frame[i, 3]); # d_yearmonth + if (yearmonth_val == "Dec1997") { + filtered_idx = filtered_idx + 1; + matching_dates[filtered_idx, 1] = as.scalar(date_matrix_min[i, 1]); # datekey + matching_dates[filtered_idx, 2] = as.scalar(date_matrix_min[i, 2]); # d_year + } +} + +d_year_filt = matching_dates; + + +# -- JOIN TABLES WITH RA-JOIN FUNCTION -- +# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED CUSTOMER TABLE WHERE LO_CUSTKEY = C_CUSTKEY +lo_cust = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=c_city_filt, colB=1, method="sort-merge"); + +# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY +lo_cust_sup = raJoin::m_raJoin(A=lo_cust, colA=3, B=s_city_filt, colB=1, method="sort-merge"); + +# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY +joined_matrix = raJoin::m_raJoin(A=lo_cust_sup, colA=4, B=d_year_filt, colB=1, method="sort-merge"); + +# Check if we have any results +if (nrow(joined_matrix) == 0) { + print("Q3.4 Results:"); + print("# FRAME: nrow = 0, ncol = 4"); + print("# C1 C2 C3 C4"); + print("# STRING STRING INT32 INT32"); + print(""); + print("Total number of result rows: 0"); + print("Total revenue: 0"); + print("Q3.4 finished - no matching data for Dec1997"); +} else { + + +# -- GROUP-BY & AGGREGATION -- +# LO_REVENUE : COLUMN 5 OF LINEORDER-MIN-MATRIX +revenue = joined_matrix[, 5]; +# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX +d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + ncol(sup_matrix_min) + 2)]; +# C_CITY : COLUMN 2 OF CUST-MIN-MATRIX +c_city = joined_matrix[,(ncol(lineorder_matrix_min) + 2)]; +# S_CITY : COLUMN 2 OF CUST-MIN-MATRIX +s_city = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + 2)]; + +# CALCULATING COMBINATION KEY WITH PRIORITY: C_CITY, S_CITY & D_YEAR +max_c_city = max(c_city); +max_s_city = max(s_city); +max_d_year = max(d_year); + +c_city_scale_f = ceil(max_c_city) + 1; +s_city_scale_f = ceil(max_s_city) + 1; +d_year_scale_f = ceil(max_d_year) + 1; + +combined_key = c_city * s_city_scale_f * d_year_scale_f + s_city * d_year_scale_f + d_year; + +group_input = cbind(revenue, combined_key); +agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop"); + +key = agg_result[, 1]; +revenue = rowSums(agg_result[, 2:ncol(agg_result)]); + +# EXTRACTING C_CITY, S_CITY & D_YEAR +d_year = round(key %% d_year_scale_f); +c_city = round(floor(key / (s_city_scale_f * d_year_scale_f))); +s_city = round((floor(key / d_year_scale_f)) %% s_city_scale_f); + +result = cbind(c_city, s_city, d_year, revenue); + + +# -- SORTING -- +# PRIORITY 1 D_YEAR (ASC), 2 REVENUE (DESC) +result_ordered = order(target=result, by=4, decreasing=TRUE, index.return=FALSE); +result_ordered = order(target=result_ordered, by=3, decreasing=FALSE, index.return=FALSE); + + +# -- OUTPUT RESULTS -- +print("Q3.4 Results:"); +print("# FRAME: nrow = " + nrow(result_ordered) + ", ncol = 4"); +print("# C1 C2 C3 C4"); +print("# STRING STRING INT32 INT32"); + +for (i in 1:nrow(result_ordered)) { + c_city_code = as.scalar(result_ordered[i, 1]); + s_city_code = as.scalar(result_ordered[i, 2]); + year_val = as.scalar(result_ordered[i, 3]); + revenue_val = as.scalar(result_ordered[i, 4]); + + # Map back to original city names based on the encoding used + if (c_city_code == 1) { + c_city_name = "UNITED KI1"; + } else { + c_city_name = "UNITED KI5"; + } + + if (s_city_code == 1) { + s_city_name = "UNITED KI1"; + } else { + s_city_name = "UNITED KI5"; + } + + print(c_city_name + " " + s_city_name + " " + as.integer(year_val) + " " + as.integer(revenue_val)); +} + +# Calculate total revenue for validation +total_revenue = sum(result_ordered[, 4]); +print(""); +print("Total number of result rows: " + nrow(result_ordered)); +print("Total revenue: " + as.integer(total_revenue)); +print("Q3.4 finished"); +} diff --git a/scripts/ssb/queries/q4_1.dml b/scripts/ssb/queries/q4_1.dml new file mode 100644 index 00000000000..d5d4c078662 --- /dev/null +++ b/scripts/ssb/queries/q4_1.dml @@ -0,0 +1,242 @@ +/* DML-script implementing the ssb query Q4.1 in SystemDS with Dynamic Encoding. +SELECT + d_year, + c_nation, + SUM(lo_revenue - lo_supplycost) AS PROFIT +FROM dates, customer, supplier, part, lineorder +WHERE + lo_custkey = c_custkey + AND lo_suppkey = s_suppkey + AND lo_partkey = p_partkey + AND lo_orderdate = d_datekey + AND c_region = 'AMERICA' + AND s_region = 'AMERICA' + AND ( + p_mfgr = 'MFGR#1' + OR p_mfgr = 'MFGR#2' + ) +GROUP BY d_year, c_nation +ORDER BY d_year, c_nation; +*/ + +# Input parameter +input_dir = $input_dir; + +# -- SOURCING THE RA-FUNCTIONS -- +source("./scripts/builtin/raSelection.dml") as raSel +source("./scripts/builtin/raJoin.dml") as raJoin +source("./scripts/builtin/raGroupby.dml") as raGrp + + +# -- READING INPUT FILES -- +# CSV TABLES +date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +customer_csv = read(input_dir + "/customer.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); + + +# -- MANUAL FILTERING AND DATA PREPARATION -- +# Extract minimal data needed for the query +date_matrix_min = as.matrix(cbind(date_csv[, 1], date_csv[, 5])); +lineorder_matrix_min = as.matrix(cbind(lineorder_csv[, 3], lineorder_csv[, 4], lineorder_csv[, 5], + lineorder_csv[, 6], lineorder_csv[, 13], lineorder_csv[, 14])); + +# Build filtered parts list (MFGR#1 and MFGR#2) +part_filtered_keys = matrix(0, rows=0, cols=1); + +for(i in 1:nrow(part_csv)) { + mfgr_val = as.scalar(part_csv[i, 3]); + if(mfgr_val == "MFGR#1" | mfgr_val == "MFGR#2") { + # Extract key and create single-element matrix + key_val = as.double(as.scalar(part_csv[i, 1])); + key_matrix = matrix(key_val, rows=1, cols=1); + + # Append to filtered results + part_filtered_keys = rbind(part_filtered_keys, key_matrix); + } +} +part_count = nrow(part_filtered_keys); +if(part_count == 0) { + part_filtered_keys = matrix(0, rows=1, cols=1); # Fallback for empty case +} + +# Build filtered customers list (AMERICA region) with dynamic encoding +cust_filtered_keys = matrix(0, rows=0, cols=1); +cust_filtered_nations = matrix(0, rows=0, cols=1); + +for(i in 1:nrow(customer_csv)) { + region_val = as.scalar(customer_csv[i, 6]); + if(region_val == "AMERICA") { + # Extract key and create single-element matrix + key_val = as.double(as.scalar(customer_csv[i, 1])); + key_matrix = matrix(key_val, rows=1, cols=1); + + # Extract nation and encode + nation_str = as.scalar(customer_csv[i, 5]); + if(nation_str == "ARGENTINA") { + nation_val = 3; + } else if(nation_str == "CANADA") { + nation_val = 5; + } else if(nation_str == "PERU") { + nation_val = 8; + } else if(nation_str == "BRAZIL") { + nation_val = 13; + } else if(nation_str == "UNITED STATES") { + nation_val = 25; + } else { + nation_val = 0; # Unknown nation + } + nation_matrix = matrix(nation_val, rows=1, cols=1); + + # Append to filtered results + cust_filtered_keys = rbind(cust_filtered_keys, key_matrix); + cust_filtered_nations = rbind(cust_filtered_nations, nation_matrix); + } +} + +cust_count = nrow(cust_filtered_keys); +if(cust_count > 0) { + # Create customer matrix from filtered data + cust_filtered_data = cbind(cust_filtered_keys, cust_filtered_nations); +} else { + cust_filtered_data = matrix(0, rows=1, cols=2); # Fallback for empty case +} + +# Build filtered suppliers list (AMERICA region) +supp_filtered_keys = matrix(0, rows=0, cols=1); + +for(i in 1:nrow(supplier_csv)) { + region_val = as.scalar(supplier_csv[i, 6]); + if(region_val == "AMERICA") { + # Extract key and create single-element matrix + key_val = as.double(as.scalar(supplier_csv[i, 1])); + key_matrix = matrix(key_val, rows=1, cols=1); + + # Append to filtered results + supp_filtered_keys = rbind(supp_filtered_keys, key_matrix); + } +} +supp_count = nrow(supp_filtered_keys); +if(supp_count == 0) { + supp_filtered_keys = matrix(0, rows=1, cols=1); # Fallback for empty case +} + +# Ensure filtered matrices are properly formatted +if(cust_count > 0) { + cust_matrix_formatted = cust_filtered_data; # Use the already created matrix +} else { + cust_matrix_formatted = matrix(0, rows=1, cols=2); +} + +if(supp_count > 0) { + supp_matrix_formatted = supp_filtered_keys; # Use the already created matrix +} else { + supp_matrix_formatted = matrix(0, rows=1, cols=1); +} + +if(part_count > 0) { + part_matrix_formatted = part_filtered_keys; # Use the already created matrix +} else { + part_matrix_formatted = matrix(0, rows=1, cols=1); +} + +# -- JOIN TABLES WITH RA-JOIN FUNCTION (SORT-MERGE METHOD) -- +# Remove any potential zero values from customer matrix +valid_cust_mask = (cust_matrix_formatted[, 1] > 0); +if(sum(valid_cust_mask) > 0) { + cust_clean = removeEmpty(target=cust_matrix_formatted, margin="rows", select=valid_cust_mask); +} else { + stop("No valid customer data"); +} + +# Join lineorder with filtered customer table (lo_custkey = c_custkey) +lo_cust = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=cust_clean, colB=1, method="sort-merge"); + +# Join with filtered supplier table (lo_suppkey = s_suppkey) +lo_cust_sup = raJoin::m_raJoin(A=lo_cust, colA=3, B=supp_matrix_formatted, colB=1, method="sort-merge"); + +# Join with filtered part table (lo_partkey = p_partkey) +lo_cust_sup_part = raJoin::m_raJoin(A=lo_cust_sup, colA=2, B=part_matrix_formatted, colB=1, method="sort-merge"); + +# Join with date table (lo_orderdate = d_datekey) +joined_matrix = raJoin::m_raJoin(A=lo_cust_sup_part, colA=4, B=date_matrix_min, colB=1, method="sort-merge"); +# -- GROUP-BY & AGGREGATION -- +lo_revenue = joined_matrix[, 5]; +lo_supplycost = joined_matrix[, 6]; +d_year = joined_matrix[, ncol(joined_matrix)]; # last column (d_year) +c_nation = joined_matrix[, 8]; # customer nation column + +profit = lo_revenue - lo_supplycost; + +# Create nation mapping for grouping +unique_nations = unique(c_nation); +nation_encoding = matrix(0, rows=nrow(unique_nations), cols=1); +for(i in 1:nrow(unique_nations)) { + nation_encoding[i, 1] = i; +} + +# Encode nations to numbers for grouping +c_nation_encoded = matrix(0, rows=nrow(c_nation), cols=1); +for(i in 1:nrow(c_nation)) { + for(j in 1:nrow(unique_nations)) { + if(as.scalar(c_nation[i, 1]) == as.scalar(unique_nations[j, 1])) { + c_nation_encoded[i, 1] = j; + } + } +} + +# Create combined grouping key +max_nation = max(c_nation_encoded); +max_year = max(d_year); + +nation_scale = ceil(max_nation) + 1; +year_scale = ceil(max_year) + 1; + +combined_key = c_nation_encoded * year_scale + d_year; + +# Group and aggregate +group_input = cbind(profit, combined_key); +agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop"); + +# Extract results +key = agg_result[, 1]; +profit_sum = rowSums(agg_result[, 2:ncol(agg_result)]); + +# Decode results +d_year_result = round(key %% year_scale); +c_nation_encoded_result = round(floor(key / year_scale)); + +# Prepare for sorting +result = cbind(d_year_result, c_nation_encoded_result, profit_sum); + +# Sort by year, then by nation +result_ordered = order(target=result, by=2, decreasing=FALSE, index.return=FALSE); +result_ordered = order(target=result_ordered, by=1, decreasing=FALSE, index.return=FALSE); + +# Create nation name lookup based on encoding +nation_lookup = matrix(0, rows=nrow(result_ordered), cols=1); +for(i in 1:nrow(result_ordered)) { + nation_idx = as.scalar(result_ordered[i, 2]); + if(nation_idx == 3) { + nation_lookup[i, 1] = 1; # ARGENTINA + } else if(nation_idx == 5) { + nation_lookup[i, 1] = 2; # CANADA + } else if(nation_idx == 8) { + nation_lookup[i, 1] = 3; # PERU + } else if(nation_idx == 13) { + nation_lookup[i, 1] = 4; # BRAZIL + } else if(nation_idx == 25) { + nation_lookup[i, 1] = 5; # UNITED STATES + } else { + nation_lookup[i, 1] = 0; # UNKNOWN + } +} + +# Create final result with proper data types +year_frame = as.frame(result_ordered[, 1]); +profit_frame = as.frame(result_ordered[, 3]); + +# Output final results (Year, Nation_Code, Profit) +print(result_ordered); \ No newline at end of file diff --git a/scripts/ssb/queries/q4_2.dml b/scripts/ssb/queries/q4_2.dml new file mode 100644 index 00000000000..7140713339e --- /dev/null +++ b/scripts/ssb/queries/q4_2.dml @@ -0,0 +1,213 @@ +/* DML-script implementing the ssb query Q4.2 in SystemDS with on-the-fly encoding (no external meta files). +SELECT + d_year, + s_nation, + p_category, + SUM(lo_revenue - lo_supplycost) AS PROFIT +FROM dates, customer, supplier, part, lineorder +WHERE + lo_custkey = c_custkey + AND lo_suppkey = s_suppkey + AND lo_partkey = p_partkey + AND lo_orderdate = d_datekey + AND c_region = 'AMERICA' + AND s_region = 'AMERICA' + AND ( + d_year = 1997 + OR d_year = 1998 + ) + AND ( + p_mfgr = 'MFGR#1' + OR p_mfgr = 'MFGR#2' + ) +GROUP BY d_year, s_nation, p_category +ORDER BY d_year, s_nation, p_category; +*/ + +# -- SOURCING THE RA-FUNCTIONS -- +source("./scripts/builtin/raSelection.dml") as raSel +source("./scripts/builtin/raJoin.dml") as raJoin +source("./scripts/builtin/raGroupby.dml") as raGrp + +## Input parameter +input_dir = $input_dir; + +# -- READING INPUT FILES -- +# CSV TABLES +date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +customer_csv = read(input_dir + "/customer.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); + + +# -- PREPARING -- +# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR +date_csv_min = cbind(date_csv[, 1], date_csv[, 5]); +date_matrix_min = as.matrix(date_csv_min); + +# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-3 : LO_CUSTKEY | COL-4 : LO_PARTKEY | +# COL-5 : LO_SUPPKEY | COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE | COL-14 : LO_SUPPLYCOST +lineorder_csv_min = cbind(lineorder_csv[, 3], lineorder_csv[, 4], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13], lineorder_csv[, 14]); +lineorder_matrix_min = as.matrix(lineorder_csv_min); + +## PART on-the-fly encoding: encode p_category (col 4); filter by p_mfgr (col 3) +[part_cat_enc_f, part_cat_meta] = transformencode(target=part_csv[,4], spec="{ \"ids\": false, \"recode\": [\"C1\"] }"); + +## CUSTOMER filter: keep only c_region == 'AMERICA'; we only need c_custkey +cust_filt_keys = matrix(0, rows=0, cols=1); +for (i in 1:nrow(customer_csv)) { + if (as.scalar(customer_csv[i,6]) == "AMERICA") { + key_val = as.double(as.scalar(customer_csv[i,1])); + cust_filt_keys = rbind(cust_filt_keys, matrix(key_val, rows=1, cols=1)); + } +} +if (nrow(cust_filt_keys) == 0) { cust_filt_keys = matrix(0, rows=1, cols=1); } + +## SUPPLIER on-the-fly encoding: encode s_nation (col 5); filter by s_region (col 6) +[sup_nat_enc_f, sup_nat_meta] = transformencode(target=supplier_csv[,5], spec="{ \"ids\": false, \"recode\": [\"C1\"] }"); +sup_filt_keys = matrix(0, rows=0, cols=1); +sup_filt_nat = matrix(0, rows=0, cols=1); +for (i in 1:nrow(supplier_csv)) { + if (as.scalar(supplier_csv[i,6]) == "AMERICA") { + key_val = as.double(as.scalar(supplier_csv[i,1])); + nat_code = as.double(as.scalar(sup_nat_enc_f[i,1])); + sup_filt_keys = rbind(sup_filt_keys, matrix(key_val, rows=1, cols=1)); + sup_filt_nat = rbind(sup_filt_nat, matrix(nat_code, rows=1, cols=1)); + } +} +if (nrow(sup_filt_keys) == 0) { sup_filt_keys = matrix(0, rows=1, cols=1); sup_filt_nat = matrix(0, rows=1, cols=1); } +sup_filt = cbind(sup_filt_keys, sup_filt_nat); + + +## -- FILTERING THE DATA -- +# P_MFGR = 'MFGR#1' OR 'MFGR#2' -> build filtered part table keeping key and encoded category +part_filt_keys = matrix(0, rows=0, cols=1); +part_filt_cat = matrix(0, rows=0, cols=1); +for (i in 1:nrow(part_csv)) { + mfgr_val = as.scalar(part_csv[i,3]); + if (mfgr_val == "MFGR#1" | mfgr_val == "MFGR#2") { + key_val = as.double(as.scalar(part_csv[i,1])); + cat_code = as.double(as.scalar(part_cat_enc_f[i,1])); + part_filt_keys = rbind(part_filt_keys, matrix(key_val, rows=1, cols=1)); + part_filt_cat = rbind(part_filt_cat, matrix(cat_code, rows=1, cols=1)); + } +} +if (nrow(part_filt_keys) == 0) { part_filt_keys = matrix(0, rows=1, cols=1); part_filt_cat = matrix(0, rows=1, cols=1); } +part_filt = cbind(part_filt_keys, part_filt_cat); + +## D_YEAR = 1997 OR 1998 +d_year_filt_1 = raSel::m_raSelection(date_matrix_min, col=2, op="==", val=1997); +d_year_filt_2 = raSel::m_raSelection(date_matrix_min, col=2, op="==", val=1998); +d_year_filt = rbind(d_year_filt_1, d_year_filt_2); + + +# -- JOIN TABLES WITH RA-JOIN FUNCTION -- +## -- JOIN TABLES WITH RA-JOIN FUNCTION -- +# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED CUSTOMER TABLE WHERE LO_CUSTKEY = C_CUSTKEY +lo_cust = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=cust_filt_keys, colB=1, method="sort-merge"); + +# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY (carry s_nation code) +lo_cust_sup = raJoin::m_raJoin(A=lo_cust, colA=3, B=sup_filt, colB=1, method="sort-merge"); + +# JOIN: ⨝ PART WHERE LO_PARTKEY = P_PARTKEY (carry p_category code) +lo_cust_sup_part = raJoin::m_raJoin(A=lo_cust_sup, colA=2, B=part_filt, colB=1, method="sort-merge"); + +# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY +joined_matrix = raJoin::m_raJoin(A=lo_cust_sup_part, colA=4, B=d_year_filt, colB=1, method="sort-merge"); + + +# -- GROUP-BY & AGGREGATION -- +# LO_REVENUE : COLUMN 5 OF LINEORDER-MIN-MATRIX +lo_revenue = joined_matrix[, 5]; +# LO_SUPPLYCOST : COLUMN 6 OF LINEORDER-MIN-MATRIX +lo_supplycost = joined_matrix[, 6]; +# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX (last added 2nd col) +d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_filt_keys) + ncol(sup_filt) + ncol(part_filt) + 2)]; +# S_NATION (encoded) : COLUMN 2 OF SUPPLIER-FILTERED MATRIX +s_nation = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_filt_keys) + 2)]; +# P_CATEGORY (encoded) : COLUMN 2 OF PART-FILTERED MATRIX +p_category = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_filt_keys) + ncol(sup_filt) + 2)]; + +profit = lo_revenue - lo_supplycost; + +# CALCULATING COMBINATION KEY WITH PRIORITY: D_YEAR, S_NATION, P_CATEGORY (internal codes for grouping) +max_s_nation_grp = max(s_nation); +max_p_category_grp = max(p_category); +max_d_year_grp = max(d_year); + +s_nation_scale_grp = ceil(max_s_nation_grp) + 1; +p_category_scale_grp = ceil(max_p_category_grp) + 1; +d_year_scale_grp = ceil(max_d_year_grp) + 1; + +combined_key_grp = d_year * s_nation_scale_grp * p_category_scale_grp + s_nation * p_category_scale_grp + p_category; + +group_input = cbind(profit, combined_key_grp); +agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop"); + +key_grp = agg_result[, 1]; +profit_sum = rowSums(agg_result[, 2:ncol(agg_result)]); + +# EXTRACTING D_YEAR, S_NATION, P_CATEGORY (internal codes) +d_year_grp = round(floor(key_grp / (s_nation_scale_grp * p_category_scale_grp))); +s_nation_grp = round(floor((key_grp %% (s_nation_scale_grp * p_category_scale_grp)) / p_category_scale_grp)); +p_category_grp = round(key_grp %% p_category_scale_grp); + +# Decode specs for later +sup_dec_spec = "{ \"recode\": [\"C1\"] }"; +part_dec_spec = "{ \"recode\": [\"C1\"] }"; + +# Decode categories for display-code mapping (unordered) +p_cat_dec_all = transformdecode(target=p_category_grp, spec=part_dec_spec, meta=part_cat_meta); + +# Build display codes to match legacy meta mapping for p_category +p_category_disp = matrix(0, rows=nrow(p_cat_dec_all), cols=1); +for (i in 1:nrow(p_cat_dec_all)) { + cat_str = as.scalar(p_cat_dec_all[i,1]); + if (cat_str == "MFGR#11") p_category_disp[i,1] = 1; + else if (cat_str == "MFGR#12") p_category_disp[i,1] = 2; + else if (cat_str == "MFGR#13") p_category_disp[i,1] = 6; + else if (cat_str == "MFGR#15") p_category_disp[i,1] = 20; + else if (cat_str == "MFGR#21") p_category_disp[i,1] = 14; + else if (cat_str == "MFGR#22") p_category_disp[i,1] = 10; + else if (cat_str == "MFGR#23") p_category_disp[i,1] = 25; + else if (cat_str == "MFGR#24") p_category_disp[i,1] = 24; + else if (cat_str == "MFGR#25") p_category_disp[i,1] = 5; + else p_category_disp[i,1] = as.double(0); +} + +# s_nation codes already align with legacy mapping; reuse as display codes +s_nation_disp = s_nation_grp; + +# Compute display key using display codes +s_nation_scale_disp = ceil(max(s_nation_disp)) + 1; +p_category_scale_disp = ceil(max(p_category_disp)) + 1; +d_year_scale_disp = ceil(max(d_year_grp)) + 1; + +key_disp = d_year_grp * s_nation_scale_disp * p_category_scale_disp + s_nation_disp * p_category_scale_disp + p_category_disp; + +# Compose display result and sort by display key to match legacy order +result_disp = cbind(d_year_grp, s_nation_disp, p_category_disp, profit_sum, key_disp); +idx_order = order(target=result_disp, by=5, decreasing=FALSE, index.return=TRUE); +result_ordered_disp = order(target=result_disp, by=5, decreasing=FALSE, index.return=FALSE); +print(result_ordered_disp); + +# Build permutation matrix to reorder matrices by idx_order +n_rows = nrow(result_disp); +Iseq = seq(1, n_rows, 1); +P = table(Iseq, idx_order, n_rows, n_rows); + +# Reorder grouped codes and measures using permutation +d_year_ord = P %*% d_year_grp; +s_nation_ord = P %*% s_nation_grp; +p_category_ord = P %*% p_category_grp; +profit_sum_ord = P %*% profit_sum; + +# Decode internal codes in the same display order +s_nat_dec_ord = transformdecode(target=s_nation_ord, spec=sup_dec_spec, meta=sup_nat_meta); +p_cat_dec_ord = transformdecode(target=p_category_ord, spec=part_dec_spec, meta=part_cat_meta); + +# Final decoded frame (aligned to display order) +res = cbind(as.frame(d_year_ord), s_nat_dec_ord, p_cat_dec_ord, as.frame(profit_sum_ord)); +print(res); + diff --git a/scripts/ssb/queries/q4_3.dml b/scripts/ssb/queries/q4_3.dml new file mode 100644 index 00000000000..69462151089 --- /dev/null +++ b/scripts/ssb/queries/q4_3.dml @@ -0,0 +1,173 @@ +# DML-script implementing the ssb query Q4.3 in SystemDS. + +/* DML-script implementing the ssb query Q4.3 in SystemDS with on-the-fly encoding (no external meta files). +SELECT + d_year, + s_city, + p_brand, + SUM(lo_revenue - lo_supplycost) AS PROFIT +FROM dates, customer, supplier, part, lineorder +WHERE + lo_custkey = c_custkey + AND lo_suppkey = s_suppkey + AND lo_partkey = p_partkey + AND lo_orderdate = d_datekey + AND s_nation = 'UNITED STATES' + AND ( + d_year = 1997 + OR d_year = 1998 + ) + AND p_category = 'MFGR#14' +GROUP BY d_year, s_city, p_brand +ORDER BY d_year, s_city, p_brand; +*/ + +# -- SOURCING THE RA-FUNCTIONS -- +source("./scripts/builtin/raSelection.dml") as raSel +source("./scripts/builtin/raJoin.dml") as raJoin +source("./scripts/builtin/raGroupby.dml") as raGrp + +## Input parameter +input_dir = $input_dir; + +# -- READING INPUT FILES -- +# CSV TABLES +date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); +customer_csv = read(input_dir + "/customer.tbl", data_type="frame", format="csv", header=FALSE, sep="|"); + + +# -- PREPARING -- +# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR +date_csv_min = cbind(date_csv[, 1], date_csv[, 5]); +date_matrix_min = as.matrix(date_csv_min); + +# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-3 : LO_CUSTKEY | COL-4 : LO_PARTKEY | +# COL-5 : LO_SUPPKEY | COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE | COL-14 : LO_SUPPLYCOST +lineorder_csv_min = cbind(lineorder_csv[, 3], lineorder_csv[, 4], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13], lineorder_csv[, 14]); +lineorder_matrix_min = as.matrix(lineorder_csv_min); + +## Prepare PART on-the-fly encodings (only need p_brand encoding, filter by p_category string) +# We'll encode column 5 (p_brand) on-the-fly and later filter by category string 'MFGR#14'. +[part_brand_enc_f, part_brand_meta] = transformencode(target=part_csv[,5], spec="{ \"ids\": false, \"recode\": [\"C1\"] }"); + +# EXTRACTING MINIMAL CUSTOMER DATA TO OPTIMIZE RUNTIME => COL-1 : CUSTOMER-KEY +cust_csv_min = customer_csv[, 1]; +cust_matrix_min = as.matrix(cust_csv_min); + +## Prepare SUPPLIER on-the-fly encodings (encode s_city, filter by s_nation string) +[sup_city_enc_f, sup_city_meta] = transformencode(target=supplier_csv[,4], spec="{ \"ids\": false, \"recode\": [\"C1\"] }"); + + +## -- FILTERING THE DATA WITH RA-SELECTION FUNCTION / LOOPS -- +# D_YEAR = 1997 OR 1998 +d_year_filt_1 = raSel::m_raSelection(date_matrix_min, col=2, op="==", val=1997); +d_year_filt_2 = raSel::m_raSelection(date_matrix_min, col=2, op="==", val=1998); +d_year_filt = rbind(d_year_filt_1, d_year_filt_2); + +# Build filtered SUPPLIER table (s_nation == 'UNITED STATES'), keeping key and encoded city +sup_filt_keys = matrix(0, rows=0, cols=1); +sup_filt_city = matrix(0, rows=0, cols=1); +for (i in 1:nrow(supplier_csv)) { + if (as.scalar(supplier_csv[i,5]) == "UNITED STATES") { + key_val = as.double(as.scalar(supplier_csv[i,1])); + city_code = as.double(as.scalar(sup_city_enc_f[i,1])); + sup_filt_keys = rbind(sup_filt_keys, matrix(key_val, rows=1, cols=1)); + sup_filt_city = rbind(sup_filt_city, matrix(city_code, rows=1, cols=1)); + } +} +if (nrow(sup_filt_keys) == 0) { + # Fallback to avoid empty join + sup_filt_keys = matrix(0, rows=1, cols=1); + sup_filt_city = matrix(0, rows=1, cols=1); +} +sup_filt = cbind(sup_filt_keys, sup_filt_city); + +# Build filtered PART table (p_category == 'MFGR#14'), keeping key and encoded brand +part_filt_keys = matrix(0, rows=0, cols=1); +part_filt_brand = matrix(0, rows=0, cols=1); +for (i in 1:nrow(part_csv)) { + if (as.scalar(part_csv[i,4]) == "MFGR#14") { + key_val = as.double(as.scalar(part_csv[i,1])); + brand_code = as.double(as.scalar(part_brand_enc_f[i,1])); + part_filt_keys = rbind(part_filt_keys, matrix(key_val, rows=1, cols=1)); + part_filt_brand = rbind(part_filt_brand, matrix(brand_code, rows=1, cols=1)); + } +} +if (nrow(part_filt_keys) == 0) { + part_filt_keys = matrix(0, rows=1, cols=1); + part_filt_brand = matrix(0, rows=1, cols=1); +} +part_filt = cbind(part_filt_keys, part_filt_brand); + + +# -- JOIN TABLES WITH RA-JOIN FUNCTION -- +# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED SUPPLIER TABLE WHERE LO_SUPPKEY = S_SUPPKEY +lo_sup = raJoin::m_raJoin(A=lineorder_matrix_min, colA=3, B=sup_filt, colB=1, method="sort-merge"); + +# JOIN: ⨝ PART WHERE LO_PARTKEY = P_PARTKEY +lo_sup_part = raJoin::m_raJoin(A=lo_sup, colA=2, B=part_filt, colB=1, method="sort-merge"); + +# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY +lo_sup_part_date = raJoin::m_raJoin(A=lo_sup_part, colA=4, B=d_year_filt, colB=1, method="sort-merge"); + +# JOIN: ⨝ CUSTOMER WHERE LO_CUSTKEY = C_CUSTKEY (no filter used, but keep join for parity) +cust_matrix_min = as.matrix(customer_csv[,1]); +joined_matrix = raJoin::m_raJoin(A=lo_sup_part_date, colA=1, B=cust_matrix_min, colB=1, method="sort-merge"); + + +# -- GROUP-BY & AGGREGATION -- +# LO_REVENUE : COLUMN 5 OF LINEORDER-MIN-MATRIX +lo_revenue = joined_matrix[, 5]; +# LO_SUPPLYCOST : COLUMN 6 OF LINEORDER-MIN-MATRIX +lo_supplycost = joined_matrix[, 6]; +# D_YEAR : last column added in the previous join with date (2nd col of date_min) +d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(sup_filt) + ncol(part_filt) + 2)]; +# S_CITY (encoded) : COLUMN 2 OF SUPPLIER-FILTERED MATRIX +s_city = joined_matrix[,(ncol(lineorder_matrix_min) + 2)]; +# P_BRAND (encoded) : COLUMN 2 OF PART-FILTERED MATRIX +p_brand = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(sup_filt) + 2)]; + +profit = lo_revenue - lo_supplycost; + +# CALCULATING COMBINATION KEY WITH PRIORITY: D_YEAR, S_CITY, P_BRAND +max_s_city = max(s_city); +max_p_brand = max(p_brand); +max_d_year = max(d_year); + +s_city_scale_f = ceil(max_s_city) + 1; +p_brand_scale_f = ceil(max_p_brand) + 1; +d_year_scale_f = ceil(max_d_year) + 1; + +combined_key = d_year * s_city_scale_f * p_brand_scale_f + s_city * p_brand_scale_f + p_brand; + +group_input = cbind(profit, combined_key); +agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop"); + +key = agg_result[, 1]; +profit = rowSums(agg_result[, 2:ncol(agg_result)]); + +# EXTRACTING D_YEAR, S_CITY, P_BRAND +d_year = round(floor(key / (s_city_scale_f * p_brand_scale_f))); +s_city = round(floor((key %% (s_city_scale_f * p_brand_scale_f)) / p_brand_scale_f)); +p_brand = round(key %% p_brand_scale_f); + +result = cbind(d_year, s_city, p_brand, profit, key); + +# -- SORTING -- +# PRIORITY 1 D_YEAR, 2 S_CITY, 3 P_BRAND +result_ordered = order(target=result, by=5, decreasing=FALSE, index.return=FALSE); +print(result_ordered); + +# -- DECODING S_CITY & P_BRAND (using on-the-fly meta from transformencode) -- +sup_dec_spec = "{ \"recode\": [\"C1\"] }"; +part_dec_spec = "{ \"recode\": [\"C1\"] }"; + +s_city_dec = transformdecode(target=result_ordered[, 2], spec=sup_dec_spec, meta=sup_city_meta); +p_brand_dec = transformdecode(target=result_ordered[, 3], spec=part_dec_spec, meta=part_brand_meta); + +res = cbind(as.frame(result_ordered[, 1]), s_city_dec, p_brand_dec, as.frame(result_ordered[, 4])); + +print(res); diff --git a/scripts/ssb/shell/run_all_perf.sh b/scripts/ssb/shell/run_all_perf.sh new file mode 100755 index 00000000000..9210f97ba1d --- /dev/null +++ b/scripts/ssb/shell/run_all_perf.sh @@ -0,0 +1,1509 @@ +#!/usr/bin/env bash +# +# Multi-Engine SSB Performance Benchmark Runner +# ============================================= +# +# CORE SCRIPTS STATUS: +# - Version: 1.0 (September 5, 2025) +# - Status: Production-Ready with Advanced Statistical Analysis +# +# ENHANCED FEATURES IMPLEMENTED: +# ✓ Multi-engine benchmarking (SystemDS, PostgreSQL, DuckDB) +# ✓ Advanced statistical analysis (mean, stdev, p95, CV) with high-precision calculations +# ✓ Single-pass timing optimization eliminating cache effects between measurements +# ✓ Cross-engine core timing support (SystemDS stats, PostgreSQL EXPLAIN, DuckDB JSON profiling) +# ✓ Adaptive terminal layout with dynamic column scaling and multi-row statistics display +# ✓ Comprehensive metadata collection (system info, software versions, data build info) +# ✓ Environment verification and graceful degradation for missing engines +# ✓ Real-time progress indicators with proper terminal width handling +# ✓ Precision timing measurements with millisecond accuracy using /usr/bin/time -p +# ✓ Robust error handling with pre-flight validation and error propagation +# ✓ CSV and JSON output with timestamped files and complete statistical data +# ✓ Fastest engine detection with tie handling +# ✓ Database connection validation and parallel execution control (disabled for fair comparison) +# ✓ Cross-platform compatibility (macOS/Linux) with intelligent executable discovery +# ✓ Reproducible benchmarking with configurable seeds and detailed run configuration +# +# RECENT IMPORTANT ADDITIONS: +# - Accepts --input-dir=PATH and forwards it into SystemDS DML runs via +# `-nvargs input_dir=/path/to/data`. This allows DML queries to load data from +# custom locations without hardcoded paths. +# - Runner performs a pre-flight input-dir existence check and exits early with +# a clear message when the directory is missing. +# - Test-run output is scanned for runtime SystemDS errors; when detected the +# runner marks the query as failed and includes an `error_message` field in +# the generated JSON results to aid debugging and CI automation. +# +# STATISTICAL MEASUREMENTS: +# - Mean: Arithmetic average execution time (typical performance expectation) +# - Standard Deviation: Population stdev measuring consistency/reliability +# - P95 Percentile: 95th percentile for worst-case performance bounds +# - Coefficient of Variation: Relative variability as percentage for cross-scale comparison +# - Display Format: "1200.0 (±14.1ms/1.2%, p95:1220.0ms)" showing all key metrics +# +# ENGINES SUPPORTED: +# - SystemDS: Machine learning platform with DML queries (single-threaded via XML config) +# - PostgreSQL: Industry-standard relational database (parallel workers disabled) +# - DuckDB: High-performance analytical database (single-threaded via PRAGMA) +# +# USAGE (from repo root): +# scripts/ssb/shell/run_all_perf.sh # run full benchmark with all engines +# scripts/ssb/shell/run_all_perf.sh --stats # enable internal engine timing statistics +# scripts/ssb/shell/run_all_perf.sh --warmup=3 --repeats=10 # custom warmup and repetition settings +# scripts/ssb/shell/run_all_perf.sh --layout=wide # force wide table layout +# scripts/ssb/shell/run_all_perf.sh --seed=12345 # reproducible benchmark with specific seed +# scripts/ssb/shell/run_all_perf.sh q1.1 q2.3 q4.1 # benchmark specific queries only +# +set -euo pipefail +export LC_ALL=C + +REPEATS=5 +WARMUP=1 +POSTGRES_DB="ssb" +POSTGRES_USER="$(whoami)" +POSTGRES_HOST="localhost" + +export _JAVA_OPTIONS="${_JAVA_OPTIONS:-} -Xms2g -Xmx2g -XX:+UseParallelGC -XX:ParallelGCThreads=1" + +# Determine script directory and project root (repo root) +if command -v realpath >/dev/null 2>&1; then + SCRIPT_DIR="$(dirname "$(realpath "$0")")" +else + SCRIPT_DIR="$(python - <<'PY' +import os, sys +print(os.path.dirname(os.path.abspath(sys.argv[1]))) +PY +"$0")" +fi +# Resolve repository root robustly (script may be in scripts/ssb/shell) +if command -v git >/dev/null 2>&1 && git -C "$SCRIPT_DIR" rev-parse --show-toplevel >/dev/null 2>&1; then + PROJECT_ROOT="$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel)" +else + # Fallback: ascend until we find markers (.git or pom.xml) + __dir="$SCRIPT_DIR" + PROJECT_ROOT="" + while [[ "$__dir" != "/" ]]; do + if [[ -d "$__dir/.git" || -f "$__dir/pom.xml" ]]; then + PROJECT_ROOT="$__dir"; break + fi + __dir="$(dirname "$__dir")" + done + : "${PROJECT_ROOT:=$(cd "$SCRIPT_DIR/../../../" && pwd)}" +fi + +# Create single-thread configuration +CONF_DIR="$PROJECT_ROOT/conf" +SINGLE_THREAD_CONF="$CONF_DIR/single_thread.xml" +mkdir -p "$CONF_DIR" +if [[ ! -f "$SINGLE_THREAD_CONF" ]]; then +cat > "$SINGLE_THREAD_CONF" <<'XML' + + + sysds.cp.parallel.opsfalse + + + sysds.num.threads1 + + +XML +fi +SYS_EXTRA_ARGS=( "-config" "$SINGLE_THREAD_CONF" ) + +# Query and system directories +QUERY_DIR="$PROJECT_ROOT/scripts/ssb/queries" + +# Locate SystemDS binary +SYSTEMDS_CMD="$PROJECT_ROOT/bin/systemds" +if [[ ! -x "$SYSTEMDS_CMD" ]]; then + SYSTEMDS_CMD="$(command -v systemds || true)" +fi +if [[ -z "$SYSTEMDS_CMD" || ! -x "$SYSTEMDS_CMD" ]]; then + echo "SystemDS binary not found." >&2 + exit 1 +fi + +# Database directories and executables +# SQL files were moved under scripts/ssb/sql +SQL_DIR="$PROJECT_ROOT/scripts/ssb/sql" + +# Try to find PostgreSQL psql executable +PSQL_EXEC="" +for path in "/opt/homebrew/opt/libpq/bin/psql" "/usr/local/bin/psql" "/usr/bin/psql" "$(command -v psql || true)"; do + if [[ -x "$path" ]]; then + PSQL_EXEC="$path" + break + fi +done + +# Try to find DuckDB executable +DUCKDB_EXEC="" +for path in "/opt/homebrew/bin/duckdb" "/usr/local/bin/duckdb" "/usr/bin/duckdb" "$(command -v duckdb || true)"; do + if [[ -x "$path" ]]; then + DUCKDB_EXEC="$path" + break + fi +done + +DUCKDB_DB_PATH="$SQL_DIR/ssb.duckdb" + +# Environment verification +verify_environment() { + local ok=true + echo "Verifying environment..." + + if [[ ! -x "$SYSTEMDS_CMD" ]]; then + echo "✗ SystemDS binary missing ($SYSTEMDS_CMD)" >&2 + ok=false + else + echo "✓ SystemDS binary found: $SYSTEMDS_CMD" + fi + + if [[ -z "$PSQL_EXEC" || ! -x "$PSQL_EXEC" ]]; then + echo "✗ psql not found (tried common paths)" >&2 + echo " PostgreSQL benchmarks will be skipped" >&2 + PSQL_EXEC="" + else + echo "✓ psql found: $PSQL_EXEC" + if ! "$PSQL_EXEC" -U "$POSTGRES_USER" -h "$POSTGRES_HOST" -d "$POSTGRES_DB" -c "SELECT 1" >/dev/null 2>&1; then + echo "✗ Could not connect to PostgreSQL database ($POSTGRES_DB)" >&2 + echo " PostgreSQL benchmarks will be skipped" >&2 + PSQL_EXEC="" + else + echo "✓ PostgreSQL database connection successful" + fi + fi + + if [[ -z "$DUCKDB_EXEC" || ! -x "$DUCKDB_EXEC" ]]; then + echo "✗ DuckDB not found (tried common paths)" >&2 + echo " DuckDB benchmarks will be skipped" >&2 + DUCKDB_EXEC="" + else + echo "✓ DuckDB found: $DUCKDB_EXEC" + if [[ ! -f "$DUCKDB_DB_PATH" ]]; then + echo "✗ DuckDB database missing ($DUCKDB_DB_PATH)" >&2 + echo " DuckDB benchmarks will be skipped" >&2 + DUCKDB_EXEC="" + elif ! "$DUCKDB_EXEC" "$DUCKDB_DB_PATH" -c "SELECT 1" >/dev/null 2>&1; then + echo "✗ DuckDB database could not be opened" >&2 + echo " DuckDB benchmarks will be skipped" >&2 + DUCKDB_EXEC="" + else + echo "✓ DuckDB database accessible" + fi + fi + + if [[ ! -x "$SYSTEMDS_CMD" ]]; then + echo "Error: SystemDS is required but not found" >&2 + exit 1 + fi + + echo "" +} + +# Convert seconds to milliseconds +sec_to_ms() { + awk -v sec="$1" 'BEGIN{printf "%.1f", sec * 1000}' +} + +# Statistical functions for multiple measurements +calculate_statistics() { + local values=("$@") + local n=${#values[@]} + + if [[ $n -eq 0 ]]; then + echo "0|0|0" + return + fi + + if [[ $n -eq 1 ]]; then + # mean|stdev|p95 + printf '%.1f|0.0|%.1f\n' "${values[0]}" "${values[0]}" + return + fi + + # Compute mean and population stdev with higher precision in a single awk pass + local mean_stdev + mean_stdev=$(printf '%s\n' "${values[@]}" | awk ' + { x[NR]=$1; s+=$1 } + END { + n=NR; if(n==0){ printf "0|0"; exit } + m=s/n; + ss=0; for(i=1;i<=n;i++){ d=x[i]-m; ss+=d*d } + stdev=sqrt(ss/n); + printf "%.6f|%.6f", m, stdev + }') + + local mean=$(echo "$mean_stdev" | cut -d'|' -f1) + local stdev=$(echo "$mean_stdev" | cut -d'|' -f2) + + # Calculate p95 (nearest-rank: ceil(0.95*n)) + local sorted_values=($(printf '%s\n' "${values[@]}" | sort -n)) + local p95_index=$(awk -v n="$n" 'BEGIN{ idx = int(0.95*n + 0.999999); if(idx<1) idx=1; if(idx>n) idx=n; print idx-1 }') + local p95=${sorted_values[$p95_index]} + + # Format to one decimal place + printf '%.1f|%.1f|%.1f\n' "$mean" "$stdev" "$p95" +} + +# Format statistics for display +format_statistics() { + local mean="$1" + local stdev="$2" + local p95="$3" + local repeats="$4" + + if [[ $repeats -eq 1 ]]; then + echo "$mean" + else + # Calculate coefficient of variation (CV) as percentage + local cv_percent=0 + if [[ $(awk -v mean="$mean" 'BEGIN{print (mean > 0)}') -eq 1 ]]; then + cv_percent=$(awk -v stdev="$stdev" -v mean="$mean" 'BEGIN{printf "%.1f", (stdev * 100) / mean}') + fi + echo "$mean (±${stdev}ms/${cv_percent}%, p95:${p95}ms)" + fi +} + +# Format only the stats line (without the mean), e.g., "(±10.2ms/0.6%, p95:1740.0ms)" +format_stats_only() { + local mean="$1" + local stdev="$2" + local p95="$3" + local repeats="$4" + + if [[ $repeats -eq 1 ]]; then + echo "" + return + fi + # Only for numeric means + if ! [[ "$mean" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then + echo "" + return + fi + local cv_percent=0 + if [[ $(awk -v mean="$mean" 'BEGIN{print (mean > 0)}') -eq 1 ]]; then + cv_percent=$(awk -v stdev="$stdev" -v mean="$mean" 'BEGIN{printf "%.1f", (stdev * 100) / mean}') + fi + echo "(±${stdev}ms/${cv_percent}%, p95:${p95}ms)" +} + +# Format only the CV line (±stdev/CV%) +format_cv_only() { + local mean="$1"; local stdev="$2"; local repeats="$3" + if [[ $repeats -eq 1 ]]; then echo ""; return; fi + if ! [[ "$mean" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then echo ""; return; fi + local cv_percent=0 + if [[ $(awk -v mean="$mean" 'BEGIN{print (mean > 0)}') -eq 1 ]]; then + cv_percent=$(awk -v stdev="$stdev" -v mean="$mean" 'BEGIN{printf "%.1f", (stdev * 100) / mean}') + fi + echo "±${stdev}ms/${cv_percent}%" +} + +# Format only the p95 line +format_p95_only() { + local p95="$1"; local repeats="$2" + if [[ $repeats -eq 1 ]]; then echo ""; return; fi + if ! [[ "$p95" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then echo ""; return; fi + echo "p95:${p95}ms" +} + +# Column widths for wide layout - optimized for 125-char terminals +WIDE_COL_WIDTHS=(8 14 14 12 16 12 12 18) + +# Draw a grid line like +----------+----------------+... +grid_line_wide() { + local parts=("+") + for w in "${WIDE_COL_WIDTHS[@]}"; do + parts+=("$(printf '%*s' "$((w+2))" '' | tr ' ' '-')+") + done + printf '%s\n' "${parts[*]}" | tr -d ' ' +} + +# Print a grid row with vertical separators using the wide layout widths +grid_row_wide() { + local -a cells=("$@") + local cols=${#WIDE_COL_WIDTHS[@]} + while [[ ${#cells[@]} -lt $cols ]]; do + cells+=("") + done + + # Build a printf format string that right-aligns numeric and statistic-like cells + # (numbers, lines starting with ± or p95, or containing p95/±) while leaving the + # first column (query) left-aligned for readability. + local fmt="" + for i in $(seq 0 $((cols-1))); do + local w=${WIDE_COL_WIDTHS[i]} + if [[ $i -eq 0 ]]; then + # Query name: left-align + fmt+="| %-${w}s" + else + local cell="${cells[i]}" + # Heuristic: right-align if the cell is a plain number or contains statistic markers + if [[ "$cell" =~ ^[[:space:]]*[0-9]+(\.[0-9]+)?[[:space:]]*$ ]] || [[ "$cell" == ±* ]] || [[ "$cell" == *'±'* ]] || [[ "$cell" == p95* ]] || [[ "$cell" == *'p95'* ]] || [[ "$cell" == \(* ]]; then + fmt+=" | %${w}s" + else + fmt+=" | %-${w}s" + fi + fi + done + fmt+=" |\n" + + printf "$fmt" "${cells[@]}" +} + +# Time a command and return real time in ms +time_command_ms() { + local out + # Properly capture stderr from /usr/bin/time while suppressing stdout of the command + out=$({ /usr/bin/time -p "$@" > /dev/null; } 2>&1) + local real_sec=$(echo "$out" | awk '/^real /{print $2}') + if [[ -z "$real_sec" || ! "$real_sec" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then + echo "(error)" + return 1 + fi + sec_to_ms "$real_sec" +} + +# Time a command, capturing stdout to a file, and return real time in ms +time_command_ms_capture() { + local stdout_file="$1"; shift + local out + out=$({ /usr/bin/time -p "$@" > "$stdout_file"; } 2>&1) + local real_sec=$(echo "$out" | awk '/^real /{print $2}') + if [[ -z "$real_sec" || ! "$real_sec" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then + echo "(error)" + return 1 + fi + sec_to_ms "$real_sec" +} + +# Run a SystemDS query and compute statistics +run_systemds_avg() { + local dml="$1" + # Optional second parameter: path to write an error message if the test-run fails + local err_out_file="${2:-}" + local shell_times=() + local core_times=() + local core_have=false + + # Change to project root directory so relative paths in DML work correctly + local original_dir="$(pwd)" + cd "$PROJECT_ROOT" + + # First, test run to validate the query (avoids timing zero or errors later) + tmp_test=$(mktemp) + if $RUN_STATS; then + if ! "$SYSTEMDS_CMD" "$dml" -stats "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}" > "$tmp_test" 2>&1; then + err_msg=$(sed -n '1,200p' "$tmp_test" | tr '\n' ' ') + echo "Error: SystemDS test run failed for $dml: $err_msg" >&2 + # Write error message to provided error file for JSON capture + if [[ -n "$err_out_file" ]]; then printf '%s' "$err_msg" > "$err_out_file" || true; fi + rm -f "$tmp_test" + echo "(error)|0|0|(n/a)|0|0" + cd "$original_dir"; return + fi + err_msg=$(sed -n '/An Error Occurred :/,$ p' "$tmp_test" | sed -n '1,200p' | tr '\n' ' ') + if [[ -n "$err_msg" ]]; then + echo "Error: SystemDS reported runtime error for $dml: $err_msg" >&2 + if [[ -n "$err_out_file" ]]; then printf '%s' "$err_msg" > "$err_out_file" || true; fi + rm -f "$tmp_test" + echo "(error)|0|0|(n/a)|0|0" + cd "$original_dir"; return + fi + else + if ! "$SYSTEMDS_CMD" "$dml" "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}" > "$tmp_test" 2>&1; then + err_msg=$(sed -n '1,200p' "$tmp_test" | tr '\n' ' ') + echo "Error: SystemDS test run failed for $dml: $err_msg" >&2 + if [[ -n "$err_out_file" ]]; then printf '%s' "$err_msg" > "$err_out_file" || true; fi + rm -f "$tmp_test" + echo "(error)|0|0|(n/a)|0|0" + cd "$original_dir"; return + fi + err_msg=$(sed -n '/An Error Occurred :/,$ p' "$tmp_test" | sed -n '1,200p' | tr '\n' ' ') + if [[ -n "$err_msg" ]]; then + echo "Error: SystemDS reported runtime error for $dml: $err_msg" >&2 + if [[ -n "$err_out_file" ]]; then printf '%s' "$err_msg" > "$err_out_file" || true; fi + rm -f "$tmp_test" + echo "(error)|0|0|(n/a)|0|0" + cd "$original_dir"; return + fi + fi + rm -f "$tmp_test" + + # Warmup runs + for ((w=1; w<=WARMUP; w++)); do + if $RUN_STATS; then + "$SYSTEMDS_CMD" "$dml" -stats "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}" > /dev/null 2>&1 || true + else + "$SYSTEMDS_CMD" "$dml" "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}" > /dev/null 2>&1 || true + fi + done + + # Timed runs - collect all measurements + for ((i=1; i<=REPEATS; i++)); do + if $RUN_STATS; then + local shell_ms + local temp_file + temp_file=$(mktemp) + shell_ms=$(time_command_ms_capture "$temp_file" "$SYSTEMDS_CMD" "$dml" -stats "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}") || { + rm -f "$temp_file"; cd "$original_dir"; echo "(error)|0|0|(n/a)|0|0"; return; } + shell_times+=("$shell_ms") + + # Extract SystemDS internal timing from the same run + local internal_sec + internal_sec=$(awk '/Total execution time:/ {print $4}' "$temp_file" | tail -1 || true) + rm -f "$temp_file" + if [[ -n "$internal_sec" ]] && [[ "$internal_sec" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then + local core_ms + core_ms=$(awk -v sec="$internal_sec" 'BEGIN{printf "%.1f", sec * 1000}') + core_times+=("$core_ms") + core_have=true + fi + else + local shell_ms + shell_ms=$(time_command_ms "$SYSTEMDS_CMD" "$dml" "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}") || { cd "$original_dir"; echo "(error)|0|0|(n/a)|0|0"; return; } + shell_times+=("$shell_ms") + fi + done + + # Return to original directory + cd "$original_dir" + + # Calculate statistics for shell times + local shell_stats + shell_stats=$(calculate_statistics "${shell_times[@]}") + + # Calculate statistics for core times if available + local core_stats + if $RUN_STATS && $core_have && [[ ${#core_times[@]} -gt 0 ]]; then + core_stats=$(calculate_statistics "${core_times[@]}") + else + core_stats="(n/a)|0|0" + fi + + echo "$shell_stats|$core_stats" +} + +# Run a PostgreSQL query and compute statistics +run_psql_avg_ms() { + local sql_file="$1" + + # Check if PostgreSQL is available + if [[ -z "$PSQL_EXEC" ]]; then + echo "(unavailable)|0|0|(n/a)|0|0" + return + fi + + # Test run first + "$PSQL_EXEC" -U "$POSTGRES_USER" -h "$POSTGRES_HOST" -d "$POSTGRES_DB" \ + -v ON_ERROR_STOP=1 -q \ + -c "SET max_parallel_workers=0; SET max_parallel_maintenance_workers=0; SET max_parallel_workers_per_gather=0; SET parallel_leader_participation=off;" \ + -f "$sql_file" >/dev/null 2>/dev/null || { + echo "(error)|0|0|(n/a)|0|0" + return + } + + local shell_times=() + local core_times=() + local core_have=false + + for ((i=1; i<=REPEATS; i++)); do + # Wall-clock shell time + local ms + ms=$(time_command_ms "$PSQL_EXEC" -U "$POSTGRES_USER" -h "$POSTGRES_HOST" -d "$POSTGRES_DB" \ + -v ON_ERROR_STOP=1 -q \ + -c "SET max_parallel_workers=0; SET max_parallel_maintenance_workers=0; SET max_parallel_workers_per_gather=0; SET parallel_leader_participation=off;" \ + -f "$sql_file" 2>/dev/null) || { + echo "(error)|0|0|(n/a)|0|0" + return + } + shell_times+=("$ms") + + # Core execution time using EXPLAIN ANALYZE (if --stats enabled) + if $RUN_STATS; then + local tmp_explain + tmp_explain=$(mktemp) + + # Create EXPLAIN ANALYZE version of the query + echo "SET max_parallel_workers=0; SET max_parallel_maintenance_workers=0; SET max_parallel_workers_per_gather=0; SET parallel_leader_participation=off;" > "$tmp_explain" + echo "EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT)" >> "$tmp_explain" + cat "$sql_file" >> "$tmp_explain" + + # Execute EXPLAIN ANALYZE and extract execution time + local explain_output core_ms + explain_output=$("$PSQL_EXEC" -U "$POSTGRES_USER" -h "$POSTGRES_HOST" -d "$POSTGRES_DB" \ + -v ON_ERROR_STOP=1 -q -f "$tmp_explain" 2>/dev/null || true) + + if [[ -n "$explain_output" ]]; then + # Extract "Execution Time: X.XXX ms" from EXPLAIN ANALYZE output + local exec_time_ms + exec_time_ms=$(echo "$explain_output" | grep -oE "Execution Time: [0-9]+\.[0-9]+" | grep -oE "[0-9]+\.[0-9]+" | head -1 || true) + + if [[ -n "$exec_time_ms" ]] && [[ "$exec_time_ms" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then + core_ms=$(awk -v ms="$exec_time_ms" 'BEGIN{printf "%.1f", ms}') + core_times+=("$core_ms") + core_have=true + fi + fi + + rm -f "$tmp_explain" + fi + done + + # Build outputs + local shell_stats core_stats + shell_stats=$(calculate_statistics "${shell_times[@]}") + if $RUN_STATS && $core_have && [[ ${#core_times[@]} -gt 0 ]]; then + core_stats=$(calculate_statistics "${core_times[@]}") + else + core_stats="(n/a)|0|0" + fi + echo "$shell_stats|$core_stats" +} + +# Run a DuckDB query and compute statistics +run_duckdb_avg_ms() { + local sql_file="$1" + + # Check if DuckDB is available + if [[ -z "$DUCKDB_EXEC" ]]; then + echo "(unavailable)|0|0|(n/a)|0|0" + return + fi + + # Test run with minimal setup (no profiling) + local tmp_test + tmp_test=$(mktemp) + printf 'PRAGMA threads=1;\n' > "$tmp_test" + cat "$sql_file" >> "$tmp_test" + "$DUCKDB_EXEC" "$DUCKDB_DB_PATH" < "$tmp_test" >/dev/null 2>&1 || { + rm -f "$tmp_test" + echo "(error)|0|0|(n/a)|0|0" + return + } + rm -f "$tmp_test" + + local shell_times=() + local core_times=() + local core_have=false + + for ((i=1; i<=REPEATS; i++)); do + local tmp_sql iter_json + tmp_sql=$(mktemp) + if $RUN_STATS; then + # Enable JSON profiling per-run and write to a temporary file + iter_json=$(mktemp -t duckprof.XXXXXX).json + cat > "$tmp_sql" < "$tmp_sql" + fi + cat "$sql_file" >> "$tmp_sql" + + # Wall-clock shell time + local ms + ms=$(time_command_ms "$DUCKDB_EXEC" "$DUCKDB_DB_PATH" < "$tmp_sql") || { + rm -f "$tmp_sql" ${iter_json:+"$iter_json"} + echo "(error)|0|0|(n/a)|0|0" + return + } + shell_times+=("$ms") + + # Parse core latency from JSON profile if available + if $RUN_STATS && [[ -n "${iter_json:-}" && -f "$iter_json" ]]; then + local core_sec + if command -v jq >/dev/null 2>&1; then + core_sec=$(jq -r '.latency // empty' "$iter_json" 2>/dev/null || true) + else + core_sec=$(grep -oE '"latency"\s*:\s*[0-9.]+' "$iter_json" 2>/dev/null | sed -E 's/.*:\s*//' | head -1 || true) + fi + if [[ -n "$core_sec" ]] && [[ "$core_sec" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then + local core_ms + core_ms=$(awk -v s="$core_sec" 'BEGIN{printf "%.1f", s*1000}') + core_times+=("$core_ms") + core_have=true + fi + fi + + rm -f "$tmp_sql" ${iter_json:+"$iter_json"} + done + + # Build outputs + local shell_stats core_stats + shell_stats=$(calculate_statistics "${shell_times[@]}") + if $RUN_STATS && $core_have && [[ ${#core_times[@]} -gt 0 ]]; then + core_stats=$(calculate_statistics "${core_times[@]}") + else + core_stats="(n/a)|0|0" + fi + echo "$shell_stats|$core_stats" +} + +# Help function +show_help() { + cat << 'EOF' +Multi-Engine SSB Performance Benchmark Runner v1.0 + +USAGE (from repo root): + scripts/ssb/shell/run_all_perf.sh [OPTIONS] [QUERIES...] + +OPTIONS: + -stats, --stats Enable SystemDS internal statistics collection + -warmup=N, --warmup=N Set number of warmup runs (default: 1) + -repeats=N, --repeats=N Set number of timing repetitions (default: 5) + -seed=N, --seed=N Set random seed for reproducible results (default: auto-generated) + -stacked, --stacked Use stacked, multi-line layout (best for narrow terminals) + -layout=MODE, --layout=MODE Set layout: auto|wide|stacked (default: auto) + Note: --layout=stacked is equivalent to --stacked + --layout=wide forces wide table layout + -input-dir=PATH, --input-dir=PATH Specify custom data directory (default: $PROJECT_ROOT/data) + -output-dir=PATH, --output-dir=PATH Specify custom output directory (default: $PROJECT_ROOT/scripts/ssb/shell/ssbOutputData/PerformanceData) + -h, -help, --help, --h Show this help message + -v, -version, --version, --v Show version information + +QUERIES: + If no queries are specified, all available SSB queries (q*.dml) will be executed. + To run specific queries, provide their names (with or without .dml extension): + scripts/ssb/shell/run_all_perf.sh q1.1 q2.3 q4.1 + +EXAMPLES (from repo root): + scripts/ssb/shell/run_all_perf.sh # Run full benchmark with all engines + scripts/ssb/shell/run_all_perf.sh --warmup=3 --repeats=10 # Custom warmup and repetition settings + scripts/ssb/shell/run_all_perf.sh -warmup=3 -repeats=10 # Same with single dashes + scripts/ssb/shell/run_all_perf.sh --stats # Enable SystemDS internal timing + scripts/ssb/shell/run_all_perf.sh --layout=wide # Force wide table layout + scripts/ssb/shell/run_all_perf.sh --stacked # Force stacked layout for narrow terminals + scripts/ssb/shell/run_all_perf.sh q1.1 q2.3 # Benchmark specific queries only + scripts/ssb/shell/run_all_perf.sh --seed=12345 # Reproducible benchmark run + scripts/ssb/shell/run_all_perf.sh --input-dir=/path/to/data # Custom data directory + scripts/ssb/shell/run_all_perf.sh -input-dir=/path/to/data # Same as above (single dash) + scripts/ssb/shell/run_all_perf.sh --output-dir=/tmp/results # Custom output directory + scripts/ssb/shell/run_all_perf.sh -output-dir=/tmp/results # Same as above (single dash) + +ENGINES: + - SystemDS: Machine learning platform with DML queries + - PostgreSQL: Industry-standard relational database (if available) + - DuckDB: High-performance analytical database (if available) + +OUTPUT: + Results are saved in CSV and JSON formats with comprehensive metadata: + - Performance timing statistics (mean, stdev, p95) + - Engine comparison and fastest detection + - System information and run configuration + +STATISTICAL OUTPUT FORMAT: + 1824 (±10, p95:1840) + │ │ └── 95th percentile (worst-case bound) + │ └── Standard deviation (consistency measure) + └── Mean execution time (typical performance) + +For more information, see the documentation in scripts/ssb/README.md +EOF +} + +# Parse arguments +RUN_STATS=false +QUERIES=() +SEED="" +LAYOUT="auto" +INPUT_DIR="" +OUTPUT_DIR="" + +# Support both --opt=value and --opt value forms +EXPECT_OPT="" +for arg in "$@"; do + if [[ -n "$EXPECT_OPT" ]]; then + case "$EXPECT_OPT" in + seed) + SEED="$arg" + EXPECT_OPT="" + continue + ;; + input-dir) + INPUT_DIR="$arg" + EXPECT_OPT="" + continue + ;; + output-dir) + OUTPUT_DIR="$arg" + EXPECT_OPT="" + continue + ;; + warmup) + WARMUP="$arg" + if ! [[ "$WARMUP" =~ ^[0-9]+$ ]] || [[ "$WARMUP" -lt 0 ]]; then + echo "Error: --warmup requires a non-negative integer (e.g., --warmup 2)" >&2 + exit 1 + fi + EXPECT_OPT="" + continue + ;; + repeats) + REPEATS="$arg" + if ! [[ "$REPEATS" =~ ^[0-9]+$ ]] || [[ "$REPEATS" -lt 1 ]]; then + echo "Error: --repeats requires a positive integer (e.g., --repeats 5)" >&2 + exit 1 + fi + EXPECT_OPT="" + continue + ;; + layout) + LAYOUT="$arg" + if [[ "$LAYOUT" != "auto" && "$LAYOUT" != "wide" && "$LAYOUT" != "stacked" ]]; then + echo "Error: --layout requires one of: auto, wide, stacked (e.g., --layout wide)" >&2 + exit 1 + fi + EXPECT_OPT="" + continue + ;; + esac + fi + + if [[ "$arg" == "--help" || "$arg" == "-help" || "$arg" == "-h" || "$arg" == "--h" ]]; then + show_help + exit 0 + elif [[ "$arg" == "--version" || "$arg" == "-version" || "$arg" == "-v" || "$arg" == "--v" ]]; then + echo "Multi-Engine SSB Performance Benchmark Runner v1.0" + echo "First Public Release: September 5, 2025" + exit 0 + elif [[ "$arg" == "--stats" || "$arg" == "-stats" ]]; then + RUN_STATS=true + elif [[ "$arg" == --seed=* || "$arg" == -seed=* ]]; then + SEED="${arg#*seed=}" + elif [[ "$arg" == "--seed" || "$arg" == "-seed" ]]; then + EXPECT_OPT="seed" + elif [[ "$arg" == --warmup=* || "$arg" == -warmup=* ]]; then + WARMUP="${arg#*warmup=}" + if ! [[ "$WARMUP" =~ ^[0-9]+$ ]] || [[ "$WARMUP" -lt 0 ]]; then + echo "Error: -warmup/--warmup requires a non-negative integer (e.g., -warmup=2)" >&2 + exit 1 + fi + elif [[ "$arg" == --input-dir=* || "$arg" == -input-dir=* ]]; then + INPUT_DIR="${arg#*input-dir=}" + elif [[ "$arg" == "--input-dir" || "$arg" == "-input-dir" ]]; then + EXPECT_OPT="input-dir" + elif [[ "$arg" == --output-dir=* || "$arg" == -output-dir=* ]]; then + OUTPUT_DIR="${arg#*output-dir=}" + elif [[ "$arg" == "--output-dir" || "$arg" == "-output-dir" ]]; then + EXPECT_OPT="output-dir" + elif [[ "$arg" == "--warmup" || "$arg" == "-warmup" ]]; then + EXPECT_OPT="warmup" + elif [[ "$arg" == --repeats=* || "$arg" == -repeats=* ]]; then + REPEATS="${arg#*repeats=}" + if ! [[ "$REPEATS" =~ ^[0-9]+$ ]] || [[ "$REPEATS" -lt 1 ]]; then + echo "Error: -repeats/--repeats requires a positive integer (e.g., -repeats=5)" >&2 + exit 1 + fi + elif [[ "$arg" == "--repeats" || "$arg" == "-repeats" ]]; then + EXPECT_OPT="repeats" + elif [[ "$arg" == "--stacked" || "$arg" == "-stacked" ]]; then + LAYOUT="stacked" + elif [[ "$arg" == --layout=* || "$arg" == -layout=* ]]; then + LAYOUT="${arg#*layout=}" + if [[ "$LAYOUT" != "auto" && "$LAYOUT" != "wide" && "$LAYOUT" != "stacked" ]]; then + echo "Error: -layout/--layout requires one of: auto, wide, stacked (e.g., --layout=wide)" >&2 + exit 1 + fi + elif [[ "$arg" == "--layout" || "$arg" == "-layout" ]]; then + EXPECT_OPT="layout" + else + # Check if argument looks like an unrecognized option (starts with dash) + if [[ "$arg" == -* ]]; then + echo "Error: Unrecognized option '$arg'" >&2 + echo "Use --help or -h to see available options." >&2 + exit 1 + else + # Treat as query name + QUERIES+=( "$(echo "$arg" | tr '.' '_')" ) + fi + fi + done + +# If the last option expected a value but none was provided +if [[ -n "$EXPECT_OPT" ]]; then + case "$EXPECT_OPT" in + seed) echo "Error: -seed/--seed requires a value (e.g., -seed=12345)" >&2 ;; + warmup) echo "Error: -warmup/--warmup requires a value (e.g., -warmup=2)" >&2 ;; + repeats) echo "Error: -repeats/--repeats requires a value (e.g., -repeats=5)" >&2 ;; + layout) echo "Error: -layout/--layout requires a value (e.g., -layout=wide)" >&2 ;; + esac + exit 1 +fi + +# Generate seed if not provided +if [[ -z "$SEED" ]]; then + SEED=$((RANDOM * 32768 + RANDOM)) +fi +if [[ ${#QUERIES[@]} -eq 0 ]]; then + for f in "$QUERY_DIR"/q*.dml; do + [[ -e "$f" ]] || continue + bname="$(basename "$f")" + QUERIES+=( "${bname%.dml}" ) + done +fi + +# Set data directory +if [[ -z "$INPUT_DIR" ]]; then + INPUT_DIR="$PROJECT_ROOT/data" +fi + +# Set output directory +if [[ -z "$OUTPUT_DIR" ]]; then + OUTPUT_DIR="$PROJECT_ROOT/scripts/ssb/shell/ssbOutputData/PerformanceData" +fi + +# Normalize paths by removing trailing slashes +INPUT_DIR="${INPUT_DIR%/}" +OUTPUT_DIR="${OUTPUT_DIR%/}" + +# Pass input directory to DML scripts via SystemDS named arguments +NVARGS=( -nvargs "input_dir=${INPUT_DIR}" ) + +# Validate data directory +if [[ ! -d "$INPUT_DIR" ]]; then + echo "Error: Data directory '$INPUT_DIR' does not exist." >&2 + echo "Please ensure the directory exists or specify a different path with -input-dir." >&2 + exit 1 +fi + +# Ensure output directory exists +mkdir -p "$OUTPUT_DIR" + +# Metadata collection functions +collect_system_metadata() { + local timestamp hostname systemds_version jdk_version postgres_version duckdb_version cpu_info ram_info + + # Basic system info + timestamp=$(date -u '+%Y-%m-%d %H:%M:%S UTC') + hostname=$(hostname 2>/dev/null || echo "unknown") + + # SystemDS version + if [[ -x "$SYSTEMDS_CMD" ]]; then + # Try to get version from pom.xml first + if [[ -f "$PROJECT_ROOT/pom.xml" ]]; then + systemds_version=$(grep -A1 'org.apache.systemds' "$PROJECT_ROOT/pom.xml" | grep '' | sed 's/.*\(.*\)<\/version>.*/\1/' | head -1 2>/dev/null || echo "unknown") + else + systemds_version="unknown" + fi + + # If pom.xml method failed, try alternative methods + if [[ "$systemds_version" == "unknown" ]]; then + # Try to extract from SystemDS JAR manifest + if [[ -f "$PROJECT_ROOT/target/systemds.jar" ]]; then + systemds_version=$(unzip -p "$PROJECT_ROOT/target/systemds.jar" META-INF/MANIFEST.MF 2>/dev/null | grep "Implementation-Version" | cut -d: -f2 | tr -d ' ' || echo "unknown") + else + # Try to find any SystemDS JAR and extract version + local jar_file=$(find "$PROJECT_ROOT" -name "systemds*.jar" | head -1 2>/dev/null) + if [[ -n "$jar_file" ]]; then + systemds_version=$(unzip -p "$jar_file" META-INF/MANIFEST.MF 2>/dev/null | grep "Implementation-Version" | cut -d: -f2 | tr -d ' ' || echo "unknown") + else + systemds_version="unknown" + fi + fi + fi + else + systemds_version="unknown" + fi + + # JDK version + if command -v java >/dev/null 2>&1; then + jdk_version=$(java -version 2>&1 | grep -v "Picked up" | head -1 | sed 's/.*"\(.*\)".*/\1/' || echo "unknown") + else + jdk_version="unknown" + fi + + # PostgreSQL version + if command -v psql >/dev/null 2>&1; then + postgres_version=$(psql --version 2>/dev/null | head -1 || echo "not available") + else + postgres_version="not available" + fi + + # DuckDB version + if command -v duckdb >/dev/null 2>&1; then + duckdb_version=$(duckdb --version 2>/dev/null || echo "not available") + else + duckdb_version="not available" + fi + + # System resources + if [[ "$(uname)" == "Darwin" ]]; then + # macOS + cpu_info=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "unknown") + ram_info=$(( $(sysctl -n hw.memsize 2>/dev/null || echo 0) / 1024 / 1024 / 1024 ))GB + else + # Linux + cpu_info=$(grep "model name" /proc/cpuinfo | head -1 | cut -d: -f2- | sed 's/^ *//' 2>/dev/null || echo "unknown") + ram_info=$(( $(grep MemTotal /proc/meminfo | awk '{print $2}' 2>/dev/null || echo 0) / 1024 / 1024 ))GB + fi + + # Store metadata globally + RUN_TIMESTAMP="$timestamp" + RUN_HOSTNAME="$hostname" + RUN_SYSTEMDS_VERSION="$systemds_version" + RUN_JDK_VERSION="$jdk_version" + RUN_POSTGRES_VERSION="$postgres_version" + RUN_DUCKDB_VERSION="$duckdb_version" + RUN_CPU_INFO="$cpu_info" + RUN_RAM_INFO="$ram_info" +} + +collect_data_metadata() { + # Check for SSB data directory and get basic stats + local ssb_data_dir="$INPUT_DIR" + local json_parts=() + local display_parts=() + + if [[ -d "$ssb_data_dir" ]]; then + # Try to get row counts from data files (if they exist) + for table in customer part supplier date; do + local file="$ssb_data_dir/${table}.tbl" + if [[ -f "$file" ]]; then + local count=$(wc -l < "$file" 2>/dev/null | tr -d ' ' || echo "0") + json_parts+=(" \"$table\": \"$count\"") + display_parts+=("$table:$count") + fi + done + # Check for any lineorder*.tbl file (SSB fact table) + local lineorder_file=$(find "$ssb_data_dir" -name "lineorder*.tbl" -type f | head -1) + if [[ -n "$lineorder_file" && -f "$lineorder_file" ]]; then + local count=$(wc -l < "$lineorder_file" 2>/dev/null | tr -d ' ' || echo "0") + json_parts+=(" \"lineorder\": \"$count\"") + display_parts+=("lineorder:$count") + fi + fi + + if [[ ${#json_parts[@]} -eq 0 ]]; then + RUN_DATA_INFO='"No data files found"' + RUN_DATA_DISPLAY="No data files found" + else + # Join array elements with commas and newlines, wrap in braces for JSON + local formatted_json="{\n" + for i in "${!json_parts[@]}"; do + formatted_json+="${json_parts[$i]}" + if [[ $i -lt $((${#json_parts[@]} - 1)) ]]; then + formatted_json+=",\n" + else + formatted_json+="\n" + fi + done + formatted_json+=" }" + RUN_DATA_INFO="$formatted_json" + + # Join with spaces for display + local IFS=" " + RUN_DATA_DISPLAY="${display_parts[*]}" + fi +} + +print_metadata_header() { + echo "==================================================================================" + echo " MULTI-ENGINE PERFORMANCE BENCHMARK METADATA" + echo "==================================================================================" + echo "Timestamp: $RUN_TIMESTAMP" + echo "Hostname: $RUN_HOSTNAME" + echo "Seed: $SEED" + echo + echo "Software Versions:" + echo " SystemDS: $RUN_SYSTEMDS_VERSION" + echo " JDK: $RUN_JDK_VERSION" + echo " PostgreSQL: $RUN_POSTGRES_VERSION" + echo " DuckDB: $RUN_DUCKDB_VERSION" + echo + echo "System Resources:" + echo " CPU: $RUN_CPU_INFO" + echo " RAM: $RUN_RAM_INFO" + echo + echo "Data Build Info:" + echo " SSB Data: $RUN_DATA_DISPLAY" + echo + echo "Run Configuration:" + echo " Statistics: $(if $RUN_STATS; then echo "enabled"; else echo "disabled"; fi)" + echo " Queries: ${#QUERIES[@]} selected" + echo " Warmup Runs: $WARMUP" + echo " Repeat Runs: $REPEATS" + echo "==================================================================================" + echo +} + +# Progress indicator function +progress_indicator() { + local query_name="$1" + local stage="$2" + # Use terminal width for proper clearing, fallback to 120 chars if tput fails + local term_width + term_width=$(tput cols 2>/dev/null || echo 120) + local spaces=$(printf "%*s" "$term_width" "") + echo -ne "\r$spaces\r$query_name: Running $stage..." +} + +# Clear progress line function +clear_progress() { + local term_width + term_width=$(tput cols 2>/dev/null || echo 120) + local spaces=$(printf "%*s" "$term_width" "") + echo -ne "\r$spaces\r" +} + +# Main execution +# Collect metadata +collect_system_metadata +collect_data_metadata + +# Print metadata header +print_metadata_header + +verify_environment +echo +echo "NOTE (macOS): You cannot drop OS caches like Linux (sync; echo 3 > /proc/sys/vm/drop_caches)." +echo "We mitigate with warm-up runs and repeated averages to ensure consistent measurements." +echo +echo "INTERPRETATION GUIDE:" +echo "- SystemDS Shell (ms): Total execution time including JVM startup, I/O, and computation" +echo "- SystemDS Core (ms): Pure computation time excluding JVM overhead (only with --stats)" +echo "- PostgreSQL (ms): Single-threaded execution time with parallel workers disabled" +echo "- PostgreSQL Core (ms): Query execution time from EXPLAIN ANALYZE (only with --stats)" +echo "- DuckDB (ms): Single-threaded execution time with threads=1 pragma" +echo "- DuckDB Core (ms): Engine-internal latency from JSON profiling (with --stats)" +echo "- (missing): SQL file not found for this query" +echo "- (n/a): Core timing unavailable (run with --stats flag for internal timing)" +echo +echo "NOTE: All engines use single-threaded execution for fair comparison." +echo " Multiple runs with averaging provide statistical reliability." +echo +echo "Single-threaded execution; warm-up runs: $WARMUP, timed runs: $REPEATS" +echo "Row 1 shows mean (ms); Row 2 shows ±stdev/CV; Row 3 shows p95 (ms)." +echo "Core execution times available for all engines with --stats flag." +echo +term_width=$(tput cols 2>/dev/null || echo 120) +if [[ "$LAYOUT" == "auto" ]]; then + if [[ $term_width -ge 140 ]]; then + LAYOUT_MODE="wide" + else + LAYOUT_MODE="stacked" + fi +else + LAYOUT_MODE="$LAYOUT" +fi + +# If the user requested wide layout but the terminal is too narrow, fall back to stacked +if [[ "$LAYOUT_MODE" == "wide" ]]; then + # compute total printable width: sum(widths) + 3*cols + 1 (accounting for separators) + sumw=0 + for w in "${WIDE_COL_WIDTHS[@]}"; do sumw=$((sumw + w)); done + cols=${#WIDE_COL_WIDTHS[@]} + total_width=$((sumw + 3*cols + 1)) + if [[ $total_width -gt $term_width ]]; then + # Try to scale columns down proportionally to fit terminal width + reserved=$((3*cols + 1)) + avail=$((term_width - reserved)) + if [[ $avail -le 0 ]]; then + : + else + # Minimum sensible widths per column (keep labels readable) + MIN_COL_WIDTHS=(6 8 8 6 10 6 6 16) + # Start with proportional distribution + declare -a new_widths=() + for w in "${WIDE_COL_WIDTHS[@]}"; do + nw=$(( w * avail / sumw )) + if [[ $nw -lt 1 ]]; then nw=1; fi + new_widths+=("$nw") + done + # Enforce minimums + sum_new=0 + for i in "${!new_widths[@]}"; do + if [[ ${new_widths[i]} -lt ${MIN_COL_WIDTHS[i]:-4} ]]; then + new_widths[i]=${MIN_COL_WIDTHS[i]:-4} + fi + sum_new=$((sum_new + new_widths[i])) + done + # If even minimums exceed available, fallback to stacked + if [[ $sum_new -gt $avail ]]; then + : + else + # Distribute remaining columns' widths left-to-right + rem=$((avail - sum_new)) + i=0 + while [[ $rem -gt 0 ]]; do + new_widths[i]=$((new_widths[i] + 1)) + rem=$((rem - 1)) + i=$(( (i + 1) % cols )) + done + # Replace WIDE_COL_WIDTHS with the scaled values for printing + WIDE_COL_WIDTHS=("${new_widths[@]}") + # Recompute total_width for logging + sumw=0 + for w in "${WIDE_COL_WIDTHS[@]}"; do sumw=$((sumw + w)); done + total_width=$((sumw + reserved)) + echo "Info: scaled wide layout to fit terminal ($term_width cols): table width $total_width" + fi + fi + fi +fi + +if [[ "$LAYOUT_MODE" == "wide" ]]; then + grid_line_wide + grid_row_wide \ + "Query" \ + "SysDS Shell" "SysDS Core" \ + "PostgreSQL" "PostgreSQL Core" \ + "DuckDB" "DuckDB Core" \ + "Fastest" + grid_row_wide "" "mean" "mean" "mean" "mean" "mean" "mean" "" + grid_row_wide "" "±/CV" "±/CV" "±/CV" "±/CV" "±/CV" "±/CV" "" + grid_row_wide "" "p95" "p95" "p95" "p95" "p95" "p95" "" + grid_line_wide +else + echo "================================================================================" + echo "Stacked layout (use --layout=wide for table view)." + echo "Row 1 shows mean (ms); Row 2 shows (±stdev/CV, p95)." + echo "--------------------------------------------------------------------------------" +fi +# Prepare output file paths and write CSV header with comprehensive metadata +# Ensure results directory exists and create timestamped filenames +RESULT_DIR="$OUTPUT_DIR" +mkdir -p "$RESULT_DIR" +RESULT_BASENAME="ssb_results_$(date -u +%Y%m%dT%H%M%SZ)" +RESULT_CSV="$RESULT_DIR/${RESULT_BASENAME}.csv" +RESULT_JSON="$RESULT_DIR/${RESULT_BASENAME}.json" + +{ + echo "# Multi-Engine Performance Benchmark Results" + echo "# Timestamp: $RUN_TIMESTAMP" + echo "# Hostname: $RUN_HOSTNAME" + echo "# Seed: $SEED" + echo "# SystemDS: $RUN_SYSTEMDS_VERSION" + echo "# JDK: $RUN_JDK_VERSION" + echo "# PostgreSQL: $RUN_POSTGRES_VERSION" + echo "# DuckDB: $RUN_DUCKDB_VERSION" + echo "# CPU: $RUN_CPU_INFO" + echo "# RAM: $RUN_RAM_INFO" + echo "# Data: $RUN_DATA_DISPLAY" + echo "# Warmup: $WARMUP, Repeats: $REPEATS" + echo "# Statistics: $(if $RUN_STATS; then echo "enabled"; else echo "disabled"; fi)" + echo "#" + echo "query,systemds_shell_display,systemds_shell_mean,systemds_shell_stdev,systemds_shell_p95,systemds_core_display,systemds_core_mean,systemds_core_stdev,systemds_core_p95,postgres_display,postgres_mean,postgres_stdev,postgres_p95,postgres_core_display,postgres_core_mean,postgres_core_stdev,postgres_core_p95,duckdb_display,duckdb_mean,duckdb_stdev,duckdb_p95,duckdb_core_display,duckdb_core_mean,duckdb_core_stdev,duckdb_core_p95,fastest" +} > "$RESULT_CSV" +for base in "${QUERIES[@]}"; do + # Show progress indicator for SystemDS + progress_indicator "$base" "SystemDS" + + dml_path="$QUERY_DIR/${base}.dml" + # Parse SystemDS results: shell_mean|shell_stdev|shell_p95|core_mean|core_stdev|core_p95 + # Capture potential SystemDS test-run error messages for JSON reporting + tmp_err_msg=$(mktemp) + systemds_result="$(run_systemds_avg "$dml_path" "$tmp_err_msg")" + # Read any captured error message + sysds_err_text="$(sed -n '1,200p' "$tmp_err_msg" 2>/dev/null | tr '\n' ' ' || true)" + rm -f "$tmp_err_msg" + IFS='|' read -r sd_shell_mean sd_shell_stdev sd_shell_p95 sd_core_mean sd_core_stdev sd_core_p95 <<< "$systemds_result" + + # Format SystemDS results for display + if [[ "$sd_shell_mean" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then + sd_shell_display=$(format_statistics "$sd_shell_mean" "$sd_shell_stdev" "$sd_shell_p95" "$REPEATS") + else + sd_shell_display="$sd_shell_mean" + sd_shell_stdev="0" + sd_shell_p95="0" + fi + if [[ "$sd_core_mean" == "(n/a)" ]]; then + sd_core_display="(n/a)" + else + sd_core_display=$(format_statistics "$sd_core_mean" "$sd_core_stdev" "$sd_core_p95" "$REPEATS") + fi + + sql_name="${base//_/.}.sql" + sql_path="$SQL_DIR/$sql_name" + pg_display="(missing)" + duck_display="(missing)" + + if [[ -n "$PSQL_EXEC" && -f "$sql_path" ]]; then + progress_indicator "$base" "PostgreSQL" + pg_result="$(run_psql_avg_ms "$sql_path")" + IFS='|' read -r pg_mean pg_stdev pg_p95 pg_core_mean pg_core_stdev pg_core_p95 <<< "$pg_result" + if [[ "$pg_mean" == "(unavailable)" || "$pg_mean" == "(error)" ]]; then + pg_display="$pg_mean" + pg_core_display="$pg_mean" + pg_stdev="0" + pg_p95="0" + pg_core_mean="(n/a)" + pg_core_stdev="0" + pg_core_p95="0" + else + pg_display=$(format_statistics "$pg_mean" "$pg_stdev" "$pg_p95" "$REPEATS") + if [[ "$pg_core_mean" != "(n/a)" ]]; then + pg_core_display=$(format_statistics "$pg_core_mean" "$pg_core_stdev" "$pg_core_p95" "$REPEATS") + else + pg_core_display="(n/a)" + fi + fi + elif [[ -z "$PSQL_EXEC" ]]; then + pg_display="(unavailable)" + pg_core_display="(unavailable)" + pg_mean="(unavailable)" + pg_core_mean="(unavailable)" + pg_stdev="0" + pg_p95="0" + pg_core_stdev="0" + pg_core_p95="0" + else + pg_display="(missing)" + pg_core_display="(missing)" + pg_mean="(missing)" + pg_core_mean="(missing)" + pg_stdev="0" + pg_p95="0" + pg_core_stdev="0" + pg_core_p95="0" + fi + + if [[ -n "$DUCKDB_EXEC" && -f "$sql_path" ]]; then + progress_indicator "$base" "DuckDB" + duck_result="$(run_duckdb_avg_ms "$sql_path")" + IFS='|' read -r duck_mean duck_stdev duck_p95 duck_core_mean duck_core_stdev duck_core_p95 <<< "$duck_result" + if [[ "$duck_mean" == "(unavailable)" || "$duck_mean" == "(error)" ]]; then + duck_display="$duck_mean" + duck_stdev="0" + duck_p95="0" + duck_core_display="(n/a)" + duck_core_mean="(n/a)" + duck_core_stdev="0" + duck_core_p95="0" + else + duck_display=$(format_statistics "$duck_mean" "$duck_stdev" "$duck_p95" "$REPEATS") + if [[ "$duck_core_mean" == "(n/a)" ]]; then + duck_core_display="(n/a)" + else + duck_core_display=$(format_statistics "$duck_core_mean" "$duck_core_stdev" "$duck_core_p95" "$REPEATS") + fi + fi + elif [[ -z "$DUCKDB_EXEC" ]]; then + duck_display="(unavailable)" + duck_mean="(unavailable)" + duck_stdev="0" + duck_p95="0" + duck_core_display="(unavailable)" + duck_core_mean="(unavailable)" + duck_core_stdev="0" + duck_core_p95="0" + else + duck_display="(missing)" + duck_mean="(missing)" + duck_stdev="0" + duck_p95="0" + duck_core_display="(missing)" + duck_core_mean="(missing)" + duck_core_stdev="0" + duck_core_p95="0" + fi + + # Determine fastest engine based on mean values + fastest="" + min_ms=999999999 + for engine in systemds pg duck; do + val="" + eng_name="" + case "$engine" in + systemds) val="$sd_shell_mean"; eng_name="SystemDS";; + pg) val="$pg_mean"; eng_name="PostgreSQL";; + duck) val="$duck_mean"; eng_name="DuckDB";; + esac + # Check if value is a valid number (including decimal) + if [[ "$val" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then + # Use awk for floating point comparison + if [[ $(awk -v val="$val" -v min="$min_ms" 'BEGIN{print (val < min)}') -eq 1 ]]; then + min_ms=$(awk -v val="$val" 'BEGIN{printf "%.1f", val}') + fastest="$eng_name" + elif [[ $(awk -v val="$val" -v min="$min_ms" 'BEGIN{print (val == min)}') -eq 1 ]] && [[ -n "$fastest" ]]; then + fastest="$fastest+$eng_name" # Show ties + fi + fi + done + [[ -z "$fastest" ]] && fastest="(n/a)" + + # Determine SystemDS per-query status and include any error message captured + systemds_status="success" + systemds_error_message=null + if [[ "$sd_shell_mean" == "(error)" ]] || [[ -n "$sysds_err_text" ]]; then + systemds_status="error" + if [[ -n "$sysds_err_text" ]]; then + # Escape quotes for JSON embedding + esc=$(printf '%s' "$sysds_err_text" | sed -e 's/"/\\"/g') + systemds_error_message="\"$esc\"" + else + systemds_error_message="\"SystemDS reported an error during test-run\"" + fi + fi + + # Prepare mean-only and stats-only cells + # Means: use numeric mean when available; otherwise use existing display label (unavailable/missing) + sd_shell_mean_cell=$([[ "$sd_shell_mean" =~ ^[0-9]+(\.[0-9]+)?$ ]] && echo "$sd_shell_mean" || echo "$sd_shell_display") + sd_core_mean_cell=$([[ "$sd_core_mean" =~ ^[0-9]+(\.[0-9]+)?$ ]] && echo "$sd_core_mean" || echo "$sd_core_display") + pg_mean_cell=$([[ "$pg_mean" =~ ^[0-9]+(\.[0-9]+)?$ ]] && echo "$pg_mean" || echo "$pg_display") + pg_core_mean_cell=$([[ "$pg_core_mean" =~ ^[0-9]+(\.[0-9]+)?$ ]] && echo "$pg_core_mean" || echo "$pg_core_display") + duck_mean_cell=$([[ "$duck_mean" =~ ^[0-9]+(\.[0-9]+)?$ ]] && echo "$duck_mean" || echo "$duck_display") + duck_core_mean_cell=$([[ "$duck_core_mean" =~ ^[0-9]+(\.[0-9]+)?$ ]] && echo "$duck_core_mean" || echo "$duck_core_display") + + # Stats lines split: CV and p95 + sd_shell_cv_cell=$(format_cv_only "$sd_shell_mean" "$sd_shell_stdev" "$REPEATS") + sd_core_cv_cell=$(format_cv_only "$sd_core_mean" "$sd_core_stdev" "$REPEATS") + pg_cv_cell=$(format_cv_only "$pg_mean" "$pg_stdev" "$REPEATS") + pg_core_cv_cell=$(format_cv_only "$pg_core_mean" "$pg_core_stdev" "$REPEATS") + duck_cv_cell=$(format_cv_only "$duck_mean" "$duck_stdev" "$REPEATS") + duck_core_cv_cell=$(format_cv_only "$duck_core_mean" "$duck_core_stdev" "$REPEATS") + + sd_shell_p95_cell=$(format_p95_only "$sd_shell_p95" "$REPEATS") + sd_core_p95_cell=$(format_p95_only "$sd_core_p95" "$REPEATS") + pg_p95_cell=$(format_p95_only "$pg_p95" "$REPEATS") + pg_core_p95_cell=$(format_p95_only "$pg_core_p95" "$REPEATS") + duck_p95_cell=$(format_p95_only "$duck_p95" "$REPEATS") + duck_core_p95_cell=$(format_p95_only "$duck_core_p95" "$REPEATS") + + # Clear progress line and display final results + clear_progress + if [[ "$LAYOUT_MODE" == "wide" ]]; then + # Three-line table style with grid separators + grid_row_wide \ + "$base" \ + "$sd_shell_mean_cell" "$sd_core_mean_cell" \ + "$pg_mean_cell" "$pg_core_mean_cell" \ + "$duck_mean_cell" "$duck_core_mean_cell" \ + "$fastest" + grid_row_wide \ + "" \ + "$sd_shell_cv_cell" "$sd_core_cv_cell" \ + "$pg_cv_cell" "$pg_core_cv_cell" \ + "$duck_cv_cell" "$duck_core_cv_cell" \ + "" + grid_row_wide \ + "" \ + "$sd_shell_p95_cell" "$sd_core_p95_cell" \ + "$pg_p95_cell" "$pg_core_p95_cell" \ + "$duck_p95_cell" "$duck_core_p95_cell" \ + "" + grid_line_wide + else + # Stacked layout for narrow terminals + echo "Query : $base Fastest: $fastest" + printf ' %-20s %s\n' "SystemDS Shell:" "$sd_shell_mean_cell" + [[ -n "$sd_shell_cv_cell" ]] && printf ' %-20s %s\n' "" "$sd_shell_cv_cell" + [[ -n "$sd_shell_p95_cell" ]] && printf ' %-20s %s\n' "" "$sd_shell_p95_cell" + printf ' %-20s %s\n' "SystemDS Core:" "$sd_core_mean_cell" + [[ -n "$sd_core_cv_cell" ]] && printf ' %-20s %s\n' "" "$sd_core_cv_cell" + [[ -n "$sd_core_p95_cell" ]] && printf ' %-20s %s\n' "" "$sd_core_p95_cell" + printf ' %-20s %s\n' "PostgreSQL:" "$pg_mean_cell" + [[ -n "$pg_cv_cell" ]] && printf ' %-20s %s\n' "" "$pg_cv_cell" + [[ -n "$pg_p95_cell" ]] && printf ' %-20s %s\n' "" "$pg_p95_cell" + printf ' %-20s %s\n' "PostgreSQL Core:" "$pg_core_mean_cell" + [[ -n "$pg_core_cv_cell" ]] && printf ' %-20s %s\n' "" "$pg_core_cv_cell" + [[ -n "$pg_core_p95_cell" ]] && printf ' %-20s %s\n' "" "$pg_core_p95_cell" + printf ' %-20s %s\n' "DuckDB:" "$duck_mean_cell" + [[ -n "$duck_cv_cell" ]] && printf ' %-20s %s\n' "" "$duck_cv_cell" + [[ -n "$duck_p95_cell" ]] && printf ' %-20s %s\n' "" "$duck_p95_cell" + printf ' %-20s %s\n' "DuckDB Core:" "$duck_core_mean_cell" + [[ -n "$duck_core_cv_cell" ]] && printf ' %-20s %s\n' "" "$duck_core_cv_cell" + [[ -n "$duck_core_p95_cell" ]] && printf ' %-20s %s\n' "" "$duck_core_p95_cell" + echo "--------------------------------------------------------------------------------" + fi + + # Write comprehensive data to CSV + echo "$base,\"$sd_shell_display\",$sd_shell_mean,$sd_shell_stdev,$sd_shell_p95,\"$sd_core_display\",$sd_core_mean,$sd_core_stdev,$sd_core_p95,\"$pg_display\",$pg_mean,$pg_stdev,$pg_p95,\"$pg_core_display\",$pg_core_mean,$pg_core_stdev,$pg_core_p95,\"$duck_display\",$duck_mean,$duck_stdev,$duck_p95,\"$duck_core_display\",$duck_core_mean,$duck_core_stdev,$duck_core_p95,$fastest" >> "$RESULT_CSV" + + # Build JSON entry for this query + json_entry=$(cat < "$RESULT_JSON" + +echo "Results saved to $RESULT_CSV" +echo "Results saved to $RESULT_JSON" diff --git a/scripts/ssb/shell/run_ssb.sh b/scripts/ssb/shell/run_ssb.sh new file mode 100755 index 00000000000..e15e2159a23 --- /dev/null +++ b/scripts/ssb/shell/run_ssb.sh @@ -0,0 +1,856 @@ +#!/usr/bin/env bash +# +# SystemDS Star Schema Benchmark (SSB) Runner +# =========================================== +# +# CORE SCRIPTS STATUS: +# - Version: 1.0 (September 5, 2025) +# - Status: Production-Ready with Advanced User Experience +# - First Public Release: September 5, 2025 +# +# FEATURES IMPLEMENTED: +# ✓ Basic SSB query execution with SystemDS 3.4.0-SNAPSHOT +# ✓ Single-threaded configuration for consistent benchmarking +# ✓ Progress indicators with real-time updates +# ✓ Comprehensive timing measurements using /usr/bin/time +# ✓ Query result extraction (scalar and table formats) +# ✓ Success/failure tracking with detailed reporting +# ✓ Query summary table with execution status +# ✓ "See below" notation with result reprinting (NEW) +# ✓ Long table outputs displayed after summary (NEW) +# ✓ Error handling with timeout protection +# ✓ Cross-platform compatibility (macOS/Linux) +# +# RECENT IMPORTANT ADDITIONS: +# - Accepts --input-dir=PATH and forwards it into DML runs as a SystemDS named +# argument: -nvargs input_dir=/path/to/data (DML can use sys.vinput_dir or +# the named argument to locate data files instead of hardcoded `data/`). +# - Fast-fail on missing input directory: the runner verifies the provided +# input path exists and exits with a clear error message if not. +# - Runtime SystemDS error detection: test-run output is scanned for runtime +# error blocks (e.g., "An Error Occurred : ..."). Queries with runtime +# failures are reported as `status: "error"` and include `error_message` +# in generated JSON metadata for easier debugging and CI integration. +# +# MAJOR FEATURES IN v1.0 (First Public Release): +# - Complete SSB query execution with SystemDS 3.4.0-SNAPSHOT +# - Enhanced "see below" notation with result reprinting +# - Long table outputs displayed after summary for better UX +# - Eliminated need to scroll back through terminal output +# - Maintained array alignment for consistent result tracking +# - JSON metadata contains complete query results, not "see below" +# - Added --out-dir option for custom output directory +# - Multi-format output: TXT, CSV, JSON for each query result +# - Structured output directory with comprehensive run.json metadata file +# +# DEPENDENCIES: +# - SystemDS binary (3.4.0-SNAPSHOT or later) +# - Single-threaded configuration file (auto-generated) +# - SSB query files in scripts/ssb/queries/ +# - Bash 4.0+ with timeout support +# +# USAGE (from repo root): +# scripts/ssb/shell/run_ssb.sh # run all SSB queries +# scripts/ssb/shell/run_ssb.sh q1.1 q2.3 # run specific queries +# scripts/ssb/shell/run_ssb.sh --stats # enable internal statistics +# scripts/ssb/shell/run_ssb.sh q3.1 --stats # run specific query with stats +# scripts/ssb/shell/run_ssb.sh --seed=12345 # run with specific seed for reproducibility +# scripts/ssb/shell/run_ssb.sh --out-dir=/path # specify output directory for results +# +set -euo pipefail +export LC_ALL=C + +# Determine script directory and project root (repo root) +if command -v realpath >/dev/null 2>&1; then + SCRIPT_DIR="$(dirname "$(realpath "$0")")" +else + SCRIPT_DIR="$(python - <<'PY' +import os, sys +print(os.path.dirname(os.path.abspath(sys.argv[1]))) +PY +"$0")" +fi +if command -v git >/dev/null 2>&1 && git -C "$SCRIPT_DIR" rev-parse --show-toplevel >/dev/null 2>&1; then + PROJECT_ROOT="$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel)" +else + __dir="$SCRIPT_DIR" + PROJECT_ROOT="" + while [[ "$__dir" != "/" ]]; do + if [[ -d "$__dir/.git" || -f "$__dir/pom.xml" ]]; then + PROJECT_ROOT="$__dir"; break + fi + __dir="$(dirname "$__dir")" + done + : "${PROJECT_ROOT:=$(cd "$SCRIPT_DIR/../../../" && pwd)}" +fi + +# Locate SystemDS executable +SYSTEMDS_CMD="$PROJECT_ROOT/bin/systemds" +if [[ ! -x "$SYSTEMDS_CMD" ]]; then + SYSTEMDS_CMD="$(command -v systemds || true)" +fi +if [[ -z "$SYSTEMDS_CMD" || ! -x "$SYSTEMDS_CMD" ]]; then + echo "Error: could not find SystemDS executable." >&2 + echo " Tried: $PROJECT_ROOT/bin/systemds and PATH" >&2 + exit 1 +fi + +# Ensure single-threaded configuration file exists +CONF_DIR="$PROJECT_ROOT/conf" +SINGLE_THREAD_CONF="$CONF_DIR/single_thread.xml" +mkdir -p "$CONF_DIR" +if [[ ! -f "$SINGLE_THREAD_CONF" ]]; then +cat > "$SINGLE_THREAD_CONF" <<'XML' + + + sysds.cp.parallel.opsfalse + + + sysds.num.threads1 + + +XML +fi +SYS_EXTRA_ARGS=( "-config" "$SINGLE_THREAD_CONF" ) + +# Query directory +QUERY_DIR="$PROJECT_ROOT/scripts/ssb/queries" + +# Verify query directory exists +if [[ ! -d "$QUERY_DIR" ]]; then + echo "Error: Query directory not found: $QUERY_DIR" >&2 + exit 1 +fi + +# Help function +show_help() { + cat << 'EOF' +SystemDS Star Schema Benchmark (SSB) Runner v1.0 + +USAGE (from repo root): + scripts/ssb/shell/run_ssb.sh [OPTIONS] [QUERIES...] + +OPTIONS: + --stats, -stats Enable SystemDS internal statistics collection + --seed=N, -seed=N Set random seed for reproducible results (default: auto-generated) + --output-dir=PATH, -output-dir=PATH Specify custom output directory (default: $PROJECT_ROOT/scripts/ssb/shell/ssbOutputData/QueryData) + --input-dir=PATH, -input-dir=PATH Specify custom data directory (default: $PROJECT_ROOT/data) + --help, -help, -h, --h Show this help message + --version, -version, -v, --v Show version information + +QUERIES: + If no queries are specified, all available SSB queries (q*.dml) will be executed. + To run specific queries, provide their names (with or without .dml extension): + ./run_ssb.sh q1.1 q2.3 q4.1 + +EXAMPLES (from repo root): + scripts/ssb/shell/run_ssb.sh # Run all SSB queries + scripts/ssb/shell/run_ssb.sh --stats # Run all queries with statistics + scripts/ssb/shell/run_ssb.sh -stats # Same as above (single dash) + scripts/ssb/shell/run_ssb.sh q1.1 q2.3 # Run specific queries only + scripts/ssb/shell/run_ssb.sh --seed=12345 --stats # Reproducible run with statistics + scripts/ssb/shell/run_ssb.sh -seed=12345 -stats # Same as above (single dash) + scripts/ssb/shell/run_ssb.sh --output-dir=/tmp/results # Custom output directory + scripts/ssb/shell/run_ssb.sh -output-dir=/tmp/results # Same as above (single dash) + scripts/ssb/shell/run_ssb.sh --input-dir=/path/to/data # Custom data directory + scripts/ssb/shell/run_ssb.sh -input-dir=/path/to/data # Same as above (single dash) + +OUTPUT: + Results are saved in multiple formats: + - TXT: Human-readable format + - CSV: Machine-readable data format + - JSON: Structured format with metadata + - run.json: Complete run metadata and results + +For more information, see the documentation in scripts/ssb/README.md +EOF +} + +# Parse arguments +RUN_STATS=false +QUERIES=() +SEED="" +OUT_DIR="" +INPUT_DIR="" +for arg in "$@"; do + if [[ "$arg" == "--help" || "$arg" == "-help" || "$arg" == "-h" || "$arg" == "--h" ]]; then + show_help + exit 0 + elif [[ "$arg" == "--version" || "$arg" == "-version" || "$arg" == "-v" || "$arg" == "--v" ]]; then + echo "SystemDS Star Schema Benchmark (SSB) Runner v1.0" + echo "First Public Release: September 5, 2025" + exit 0 + elif [[ "$arg" == "--stats" || "$arg" == "-stats" ]]; then + RUN_STATS=true + elif [[ "$arg" == --seed=* || "$arg" == -seed=* ]]; then + if [[ "$arg" == --seed=* ]]; then + SEED="${arg#--seed=}" + else + SEED="${arg#-seed=}" + fi + elif [[ "$arg" == "--seed" || "$arg" == "-seed" ]]; then + echo "Error: --seed/-seed requires a value (e.g., --seed=12345 or -seed=12345)" >&2 + exit 1 + elif [[ "$arg" == --output-dir=* || "$arg" == -output-dir=* ]]; then + if [[ "$arg" == --output-dir=* ]]; then + OUT_DIR="${arg#--output-dir=}" + else + OUT_DIR="${arg#-output-dir=}" + fi + elif [[ "$arg" == "--output-dir" || "$arg" == "-output-dir" ]]; then + echo "Error: --output-dir/-output-dir requires a value (e.g., --output-dir=/path/to/output or -output-dir=/path/to/output)" >&2 + exit 1 + elif [[ "$arg" == --input-dir=* || "$arg" == -input-dir=* ]]; then + if [[ "$arg" == --input-dir=* ]]; then + INPUT_DIR="${arg#--input-dir=}" + else + INPUT_DIR="${arg#-input-dir=}" + fi + elif [[ "$arg" == "--input-dir" || "$arg" == "-input-dir" ]]; then + echo "Error: --input-dir/-input-dir requires a value (e.g., --input-dir=/path/to/data or -input-dir=/path/to/data)" >&2 + exit 1 + else + # Check if argument looks like an unrecognized option (starts with dash) + if [[ "$arg" == -* ]]; then + echo "Error: Unrecognized option '$arg'" >&2 + echo "Use --help or -h to see available options." >&2 + exit 1 + else + # Treat as query name + name="$(echo "$arg" | tr '.' '_')" + QUERIES+=( "$name.dml" ) + fi + fi +done + +# Set default output directory if not provided +if [[ -z "$OUT_DIR" ]]; then + OUT_DIR="$PROJECT_ROOT/scripts/ssb/shell/ssbOutputData/QueryData" +fi + +# Set default input data directory if not provided +if [[ -z "$INPUT_DIR" ]]; then + INPUT_DIR="$PROJECT_ROOT/data" +fi + +# Normalize paths by removing trailing slashes +INPUT_DIR="${INPUT_DIR%/}" +OUT_DIR="${OUT_DIR%/}" + +# Ensure output directory exists +mkdir -p "$OUT_DIR" + +# Pass input directory to DML scripts via SystemDS named arguments +NVARGS=( -nvargs "input_dir=${INPUT_DIR}" ) + +# Validate input data directory exists +if [[ ! -d "$INPUT_DIR" ]]; then + echo "Error: Input data directory '$INPUT_DIR' does not exist." >&2 + echo "Please create the directory or specify a valid path with --input-dir=PATH" >&2 + exit 1 +fi + +# Generate seed if not provided +if [[ -z "$SEED" ]]; then + SEED=$((RANDOM * 32768 + RANDOM)) +fi + +# Discover queries if none provided +shopt -s nullglob +if [[ ${#QUERIES[@]} -eq 0 ]]; then + for f in "$QUERY_DIR"/q*.dml; do + if [[ -f "$f" ]]; then + QUERIES+=("$(basename "$f")") + fi + done + if [[ ${#QUERIES[@]} -eq 0 ]]; then + echo "Error: No query files found in $QUERY_DIR" >&2 + exit 1 + fi +fi +shopt -u nullglob + +# Metadata collection functions +collect_system_metadata() { + local timestamp hostname systemds_version jdk_version cpu_info ram_info + + # Basic system info + timestamp=$(date -u '+%Y-%m-%d %H:%M:%S UTC') + hostname=$(hostname 2>/dev/null || echo "unknown") + + # SystemDS version + if [[ -x "$SYSTEMDS_CMD" ]]; then + # Try to get version from pom.xml first + if [[ -f "$PROJECT_ROOT/pom.xml" ]]; then + systemds_version=$(grep -A1 'org.apache.systemds' "$PROJECT_ROOT/pom.xml" | grep '' | sed 's/.*\(.*\)<\/version>.*/\1/' | head -1 2>/dev/null || echo "unknown") + else + systemds_version="unknown" + fi + + # If pom.xml method failed, try alternative methods + if [[ "$systemds_version" == "unknown" ]]; then + # Try to extract from SystemDS JAR manifest + if [[ -f "$PROJECT_ROOT/target/systemds.jar" ]]; then + systemds_version=$(unzip -p "$PROJECT_ROOT/target/systemds.jar" META-INF/MANIFEST.MF 2>/dev/null | grep "Implementation-Version" | cut -d: -f2 | tr -d ' ' || echo "unknown") + else + # Try to find any SystemDS JAR and extract version + local jar_file=$(find "$PROJECT_ROOT" -name "systemds*.jar" | head -1 2>/dev/null) + if [[ -n "$jar_file" ]]; then + systemds_version=$(unzip -p "$jar_file" META-INF/MANIFEST.MF 2>/dev/null | grep "Implementation-Version" | cut -d: -f2 | tr -d ' ' || echo "unknown") + else + systemds_version="unknown" + fi + fi + fi + else + systemds_version="unknown" + fi + + # JDK version + if command -v java >/dev/null 2>&1; then + jdk_version=$(java -version 2>&1 | head -1 | sed 's/.*"\(.*\)".*/\1/' || echo "unknown") + else + jdk_version="unknown" + fi + + # System resources + if [[ "$(uname)" == "Darwin" ]]; then + # macOS + cpu_info=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "unknown") + ram_info=$(( $(sysctl -n hw.memsize 2>/dev/null || echo 0) / 1024 / 1024 / 1024 ))GB + else + # Linux + cpu_info=$(grep "model name" /proc/cpuinfo | head -1 | cut -d: -f2- | sed 's/^ *//' 2>/dev/null || echo "unknown") + ram_info=$(( $(grep MemTotal /proc/meminfo | awk '{print $2}' 2>/dev/null || echo 0) / 1024 / 1024 ))GB + fi + + # Store metadata globally + RUN_TIMESTAMP="$timestamp" + RUN_HOSTNAME="$hostname" + RUN_SYSTEMDS_VERSION="$systemds_version" + RUN_JDK_VERSION="$jdk_version" + RUN_CPU_INFO="$cpu_info" + RUN_RAM_INFO="$ram_info" +} + +collect_data_metadata() { + # Check for SSB data directory and get basic stats + local ssb_data_dir="$INPUT_DIR" + local json_parts=() + local display_parts=() + + if [[ -d "$ssb_data_dir" ]]; then + # Try to get row counts from data files (if they exist) + for table in customer part supplier date; do + local file="$ssb_data_dir/${table}.tbl" + if [[ -f "$file" ]]; then + local count=$(wc -l < "$file" 2>/dev/null | tr -d ' ' || echo "0") + json_parts+=(" \"$table\": \"$count\"") + display_parts+=("$table:$count") + fi + done + # Check for any lineorder*.tbl file (SSB fact table) + local lineorder_file=$(find "$ssb_data_dir" -name "lineorder*.tbl" -type f | head -1) + if [[ -n "$lineorder_file" && -f "$lineorder_file" ]]; then + local count=$(wc -l < "$lineorder_file" 2>/dev/null | tr -d ' ' || echo "0") + json_parts+=(" \"lineorder\": \"$count\"") + display_parts+=("lineorder:$count") + fi + fi + + if [[ ${#json_parts[@]} -eq 0 ]]; then + RUN_DATA_INFO='"No data files found"' + RUN_DATA_DISPLAY="No data files found" + else + # Join array elements with commas and newlines, wrap in braces for JSON + local formatted_json="{\n" + for i in "${!json_parts[@]}"; do + formatted_json+="${json_parts[$i]}" + if [[ $i -lt $((${#json_parts[@]} - 1)) ]]; then + formatted_json+=",\n" + else + formatted_json+="\n" + fi + done + formatted_json+=" }" + RUN_DATA_INFO="$formatted_json" + + # Join with spaces for display + local IFS=" " + RUN_DATA_DISPLAY="${display_parts[*]}" + fi +} + +# Output format functions +create_output_structure() { + local run_id="$1" + local base_dir="$OUT_DIR/ssb_run_$run_id" + + # Create output directory structure + mkdir -p "$base_dir"/{txt,csv,json} + + # Set global variables for output paths + OUTPUT_BASE_DIR="$base_dir" + OUTPUT_TXT_DIR="$base_dir/txt" + OUTPUT_CSV_DIR="$base_dir/csv" + OUTPUT_JSON_DIR="$base_dir/json" + OUTPUT_METADATA_FILE="$base_dir/run.json" +} + +save_query_result_txt() { + local query_name="$1" + local result_data="$2" + local output_file="$OUTPUT_TXT_DIR/${query_name}.txt" + + { + echo "=========================================" + echo "SSB Query: $query_name" + echo "=========================================" + echo "Timestamp: $(date -u '+%Y-%m-%d %H:%M:%S UTC')" + echo "Seed: $SEED" + echo "" + echo "Result:" + echo "---------" + echo "$result_data" + echo "" + echo "=========================================" + } > "$output_file" +} + +save_query_result_csv() { + local query_name="$1" + local result_data="$2" + local output_file="$OUTPUT_CSV_DIR/${query_name}.csv" + + # Check if result is a single scalar value (including negative numbers and scientific notation) + if [[ "$result_data" =~ ^-?[0-9]+(\.[0-9]+)?([eE][+-]?[0-9]+)?$ ]]; then + # Scalar result + { + echo "query,result" + echo "$query_name,$result_data" + } > "$output_file" + else + # Table result - try to convert to CSV format + { + echo "# SSB Query: $query_name" + echo "# Timestamp: $(date -u '+%Y-%m-%d %H:%M:%S UTC')" + echo "# Seed: $SEED" + # Convert space-separated table data to CSV + echo "$result_data" | sed 's/ */,/g' | sed 's/^,//g' | sed 's/,$//g' + } > "$output_file" + fi +} + +save_query_result_json() { + local query_name="$1" + local result_data="$2" + local output_file="$OUTPUT_JSON_DIR/${query_name}.json" + + # Escape quotes and special characters for JSON + local escaped_result=$(echo "$result_data" | sed 's/\\/\\\\/g' | sed 's/"/\\"/g' | tr '\n' ' ') + + { + echo "{" + echo " \"query\": \"$query_name\"," + echo " \"timestamp\": \"$(date -u '+%Y-%m-%d %H:%M:%S UTC')\"," + echo " \"seed\": $SEED," + echo " \"result\": \"$escaped_result\"," + echo " \"metadata\": {" + echo " \"systemds_version\": \"$RUN_SYSTEMDS_VERSION\"," + echo " \"hostname\": \"$RUN_HOSTNAME\"" + echo " }" + echo "}" + } > "$output_file" +} + +save_all_formats() { + local query_name="$1" + local result_data="$2" + + save_query_result_txt "$query_name" "$result_data" + save_query_result_csv "$query_name" "$result_data" + save_query_result_json "$query_name" "$result_data" +} + +# Collect metadata +collect_system_metadata +collect_data_metadata + +# Create output directory structure with timestamp-based run ID +RUN_ID="$(date +%Y%m%d_%H%M%S)" +create_output_structure "$RUN_ID" + +# Execute queries +count=0 +failed=0 +SUCCESSFUL_QUERIES=() # Array to track successfully executed queries +ALL_RUN_QUERIES=() # Array to track all queries that were attempted (in order) +QUERY_STATUS=() # Array to track status: "success" or "error" +QUERY_ERROR_MSG=() # Array to store error messages for failed queries +QUERY_RESULTS=() # Array to track query results for display +QUERY_FULL_RESULTS=() # Array to track complete query results for JSON +QUERY_STATS=() # Array to track SystemDS statistics for JSON +QUERY_TIMINGS=() # Array to track execution timing statistics +LONG_OUTPUTS=() # Array to store long table outputs for display after summary + +# Progress indicator function +progress_indicator() { + local query_name="$1" + local current="$2" + local total="$3" + echo -ne "\r[$current/$total] Running: $query_name " +} + +for q in "${QUERIES[@]}"; do + dml="$QUERY_DIR/$q" + if [[ ! -f "$dml" ]]; then + echo "Warning: query file '$dml' not found; skipping." >&2 + continue + fi + + # Show progress + progress_indicator "$q" "$((count + failed + 1))" "${#QUERIES[@]}" + + # Change to project root directory so relative paths in DML work correctly + cd "$PROJECT_ROOT" + + # Clear progress line before showing output + echo -ne "\r \r" + echo "[$((count + failed + 1))/${#QUERIES[@]}] Running: $q" + + # Record attempted query + ALL_RUN_QUERIES+=("$q") + + if $RUN_STATS; then + # Capture output to extract result + temp_output=$(mktemp) + if "$SYSTEMDS_CMD" "$dml" -stats "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}" | tee "$temp_output"; then + # Even when SystemDS exits 0, the DML can emit runtime errors. Detect common error markers. + error_msg=$(sed -n '/An Error Occurred :/,$ p' "$temp_output" | sed -n '1,200p' | tr '\n' ' ' | sed 's/^ *//;s/ *$//') + if [[ -n "$error_msg" ]]; then + echo "Error: Query $q reported runtime error" >&2 + echo "$error_msg" >&2 + failed=$((failed+1)) + QUERY_STATUS+=("error") + QUERY_ERROR_MSG+=("$error_msg") + # Maintain array alignment + QUERY_STATS+=("") + QUERY_RESULTS+=("N/A") + QUERY_FULL_RESULTS+=("N/A") + LONG_OUTPUTS+=("") + else + count=$((count+1)) + SUCCESSFUL_QUERIES+=("$q") # Track successful query + QUERY_STATUS+=("success") + # Extract result - try multiple patterns with timeouts to prevent hanging: + # 1. Simple scalar pattern like "REVENUE: 687752409" + result=$(timeout 5s grep -E "^[A-Z_]+:\s*[0-9]+" "$temp_output" | tail -1 | awk '{print $2}' 2>/dev/null || true) + full_result="$result" # For scalar results, display and full results are the same + + # 2. If no scalar pattern, check for table output and get row count + if [[ -z "$result" ]]; then + # Look for frame info like "# FRAME: nrow = 53, ncol = 3" + nrows=$(timeout 5s grep "# FRAME: nrow =" "$temp_output" | awk '{print $5}' | tr -d ',' 2>/dev/null || true) + if [[ -n "$nrows" ]]; then + result="${nrows} rows (see below)" + # Extract and store the long output for later display (excluding statistics) + long_output=$(grep -v "^#" "$temp_output" | grep -v "WARNING" | grep -v "WARN" | grep -v "^$" | sed '/^SystemDS Statistics:/,$ d') + LONG_OUTPUTS+=("$long_output") + # For JSON, store the actual table data + full_result="$long_output" + else + # Count actual data rows (lines with numbers, excluding headers and comments) - limit to prevent hanging + nrows=$(timeout 5s grep -E "^[0-9]" "$temp_output" | sed '/^SystemDS Statistics:/,$ d' | head -1000 | wc -l | tr -d ' ' 2>/dev/null || echo "0") + if [[ "$nrows" -gt 0 ]]; then + result="${nrows} rows (see below)" + # Extract and store the long output for later display (excluding statistics) + long_output=$(grep -E "^[0-9]" "$temp_output" | sed '/^SystemDS Statistics:/,$ d' | head -1000) + LONG_OUTPUTS+=("$long_output") + # For JSON, store the actual table data + full_result="$long_output" + else + result="N/A" + full_result="N/A" + LONG_OUTPUTS+=("") # Empty placeholder to maintain array alignment + fi + fi + else + LONG_OUTPUTS+=("") # Empty placeholder for scalar results to maintain array alignment + fi + QUERY_RESULTS+=("$result") # Track query result for display + QUERY_FULL_RESULTS+=("$full_result") # Track complete query result for JSON + + # Save result in all formats + query_name_clean="${q%.dml}" + + # Extract and store statistics for JSON (preserving newlines) + stats_output=$(sed -n '/^SystemDS Statistics:/,$ p' "$temp_output") + QUERY_STATS+=("$stats_output") # Track statistics for JSON + + save_all_formats "$query_name_clean" "$full_result" + fi + else + echo "Error: Query $q failed" >&2 + failed=$((failed+1)) + QUERY_STATUS+=("error") + QUERY_ERROR_MSG+=("Query execution failed (non-zero exit)") + # Add empty stats entry for failed queries to maintain array alignment + QUERY_STATS+=("") + fi + rm -f "$temp_output" + else + # Capture output to extract result + temp_output=$(mktemp) + if "$SYSTEMDS_CMD" "$dml" "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}" | tee "$temp_output"; then + # Detect runtime errors in output even if command returned 0 + error_msg=$(sed -n '/An Error Occurred :/,$ p' "$temp_output" | sed -n '1,200p' | tr '\n' ' ' | sed 's/^ *//;s/ *$//') + if [[ -n "$error_msg" ]]; then + echo "Error: Query $q reported runtime error" >&2 + echo "$error_msg" >&2 + failed=$((failed+1)) + QUERY_STATUS+=("error") + QUERY_ERROR_MSG+=("$error_msg") + QUERY_STATS+=("") + QUERY_RESULTS+=("N/A") + QUERY_FULL_RESULTS+=("N/A") + LONG_OUTPUTS+=("") + else + count=$((count+1)) + SUCCESSFUL_QUERIES+=("$q") # Track successful query + QUERY_STATUS+=("success") + # Extract result - try multiple patterns with timeouts to prevent hanging: + # 1. Simple scalar pattern like "REVENUE: 687752409" + result=$(timeout 5s grep -E "^[A-Z_]+:\s*[0-9]+" "$temp_output" | tail -1 | awk '{print $2}' 2>/dev/null || true) + full_result="$result" # For scalar results, display and full results are the same + + # 2. If no scalar pattern, check for table output and get row count + if [[ -z "$result" ]]; then + # Look for frame info like "# FRAME: nrow = 53, ncol = 3" + nrows=$(timeout 5s grep "# FRAME: nrow =" "$temp_output" | awk '{print $5}' | tr -d ',' 2>/dev/null || true) + if [[ -n "$nrows" ]]; then + result="${nrows} rows (see below)" + # Extract and store the long output for later display + long_output=$(grep -v "^#" "$temp_output" | grep -v "WARNING" | grep -v "WARN" | grep -v "^$" | tail -n +1) + LONG_OUTPUTS+=("$long_output") + # For JSON, store the actual table data + full_result="$long_output" + else + # Count actual data rows (lines with numbers, excluding headers and comments) - limit to prevent hanging + nrows=$(timeout 5s grep -E "^[0-9]" "$temp_output" | head -1000 | wc -l | tr -d ' ' 2>/dev/null || echo "0") + if [[ "$nrows" -gt 0 ]]; then + result="${nrows} rows (see below)" + # Extract and store the long output for later display + long_output=$(grep -E "^[0-9]" "$temp_output" | head -1000) + LONG_OUTPUTS+=("$long_output") + # For JSON, store the actual table data + full_result="$long_output" + else + result="N/A" + full_result="N/A" + LONG_OUTPUTS+=("") # Empty placeholder to maintain array alignment + fi + fi + else + LONG_OUTPUTS+=("") # Empty placeholder for scalar results to maintain array alignment + fi + QUERY_RESULTS+=("$result") # Track query result for display + QUERY_FULL_RESULTS+=("$full_result") # Track complete query result for JSON + + # Add empty stats entry for non-stats runs to maintain array alignment + QUERY_STATS+=("") + + # Save result in all formats + query_name_clean="${q%.dml}" + save_all_formats "$query_name_clean" "$full_result" + fi + else + echo "Error: Query $q failed" >&2 + failed=$((failed+1)) + QUERY_STATUS+=("error") + QUERY_ERROR_MSG+=("Query execution failed (non-zero exit)") + # Add empty stats entry for failed queries to maintain array alignment + QUERY_STATS+=("") + fi + rm -f "$temp_output" + fi +done + +# Summary +echo "" +echo "=========================================" +echo "SSB benchmark completed!" +echo "Total queries executed: $count" +if [[ $failed -gt 0 ]]; then + echo "Failed queries: $failed" +fi +if $RUN_STATS; then + echo "Statistics: enabled" +else + echo "Statistics: disabled" +fi + +# Display run metadata summary +echo "" +echo "=========================================" +echo "RUN METADATA SUMMARY" +echo "=========================================" +echo "Timestamp: $RUN_TIMESTAMP" +echo "Hostname: $RUN_HOSTNAME" +echo "Seed: $SEED" +echo "" +echo "Software Versions:" +echo " SystemDS: $RUN_SYSTEMDS_VERSION" +echo " JDK: $RUN_JDK_VERSION" +echo "" +echo "System Resources:" +echo " CPU: $RUN_CPU_INFO" +echo " RAM: $RUN_RAM_INFO" +echo "" +echo "Data Build Info:" +echo " SSB Data: $RUN_DATA_DISPLAY" +echo "=========================================" + +# Generate metadata JSON file (include all attempted queries with status and error messages) +{ + echo "{" + echo " \"benchmark_type\": \"ssb_systemds\"," + echo " \"timestamp\": \"$RUN_TIMESTAMP\"," + echo " \"hostname\": \"$RUN_HOSTNAME\"," + echo " \"seed\": $SEED," + echo " \"software_versions\": {" + echo " \"systemds\": \"$RUN_SYSTEMDS_VERSION\"," + echo " \"jdk\": \"$RUN_JDK_VERSION\"" + echo " }," + echo " \"system_resources\": {" + echo " \"cpu\": \"$RUN_CPU_INFO\"," + echo " \"ram\": \"$RUN_RAM_INFO\"" + echo " }," + echo -e " \"data_build_info\": $RUN_DATA_INFO," + echo " \"run_configuration\": {" + echo " \"statistics_enabled\": $(if $RUN_STATS; then echo "true"; else echo "false"; fi)," + echo " \"queries_selected\": ${#QUERIES[@]}," + echo " \"queries_executed\": $count," + echo " \"queries_failed\": $failed" + echo " }," + echo " \"results\": [" + for i in "${!ALL_RUN_QUERIES[@]}"; do + query="${ALL_RUN_QUERIES[$i]}" + status="${QUERY_STATUS[$i]:-error}" + error_msg="${QUERY_ERROR_MSG[$i]:-}" + # Find matching full_result and stats by searching SUCCESSFUL_QUERIES index + full_result="" + stats_result="" + if [[ "$status" == "success" ]]; then + # Find index in SUCCESSFUL_QUERIES + for j in "${!SUCCESSFUL_QUERIES[@]}"; do + if [[ "${SUCCESSFUL_QUERIES[$j]}" == "$query" ]]; then + full_result="${QUERY_FULL_RESULTS[$j]}" + stats_result="${QUERY_STATS[$j]}" + break + fi + done + fi + # Escape quotes and newlines for JSON + escaped_result=$(echo "$full_result" | sed 's/\\/\\\\/g' | sed 's/"/\\"/g' | tr '\n' ' ') + escaped_error=$(echo "$error_msg" | sed 's/\\/\\\\/g' | sed 's/"/\\"/g' | tr '\n' ' ') + + echo " {" + echo " \"query\": \"${query%.dml}\"," + echo " \"status\": \"$status\"," + echo " \"error_message\": \"$escaped_error\"," + echo " \"result\": \"$escaped_result\"" + if [[ -n "$stats_result" ]]; then + echo " ,\"stats\": [" + echo "$stats_result" | sed 's/\\/\\\\/g' | sed 's/"/\\"/g' | sed 's/\t/ /g' | awk ' + BEGIN { first = 1 } + { + if (!first) printf ",\n" + printf " \"%s\"", $0 + first = 0 + } + END { if (!first) printf "\n" } + ' + echo " ]" + fi + if [[ $i -lt $((${#ALL_RUN_QUERIES[@]} - 1)) ]]; then + echo " }," + else + echo " }" + fi + done + echo " ]" + echo "}" +} > "$OUTPUT_METADATA_FILE" + +echo "" +echo "Metadata saved to $OUTPUT_METADATA_FILE" +echo "Output directory: $OUTPUT_BASE_DIR" +echo " - TXT files: $OUTPUT_TXT_DIR" +echo " - CSV files: $OUTPUT_CSV_DIR" +echo " - JSON files: $OUTPUT_JSON_DIR" + +# Detailed per-query summary (show status and error messages if any) +if [[ ${#ALL_RUN_QUERIES[@]} -gt 0 ]]; then + echo "" + echo "===================================================" + echo "QUERIES SUMMARY" + echo "===================================================" + printf "%-4s %-15s %-30s %s\n" "No." "Query" "Result" "Status" + echo "---------------------------------------------------" + for i in "${!ALL_RUN_QUERIES[@]}"; do + query="${ALL_RUN_QUERIES[$i]}" + query_display="${query%.dml}" # Remove .dml extension for display + status="${QUERY_STATUS[$i]:-error}" + if [[ "$status" == "success" ]]; then + # Find index in SUCCESSFUL_QUERIES to fetch result + result="" + for j in "${!SUCCESSFUL_QUERIES[@]}"; do + if [[ "${SUCCESSFUL_QUERIES[$j]}" == "$query" ]]; then + result="${QUERY_RESULTS[$j]}" + break + fi + done + printf "%-4d %-15s %-30s %s\n" "$((i+1))" "$query_display" "$result" "✓ Success" + else + err="${QUERY_ERROR_MSG[$i]:-Unknown error}" + printf "%-4d %-15s %-30s %s\n" "$((i+1))" "$query_display" "N/A" "ERROR: ${err}" + fi + done +echo "===================================================" +fi + +# Display long outputs for queries that had table results +if [[ ${#SUCCESSFUL_QUERIES[@]} -gt 0 ]]; then + # Check if we have any long outputs to display + has_long_outputs=false + for i in "${!LONG_OUTPUTS[@]}"; do + if [[ -n "${LONG_OUTPUTS[$i]}" ]]; then + has_long_outputs=true + break + fi + done + + if $has_long_outputs; then + echo "" + echo "=========================================" + echo "DETAILED QUERY RESULTS" + echo "=========================================" + for i in "${!SUCCESSFUL_QUERIES[@]}"; do + if [[ -n "${LONG_OUTPUTS[$i]}" ]]; then + query="${SUCCESSFUL_QUERIES[$i]}" + query_display="${query%.dml}" # Remove .dml extension for display + echo "" + echo "[$((i+1))] Results for $query_display:" + echo "----------------------------------------" + echo "${LONG_OUTPUTS[$i]}" + echo "----------------------------------------" + fi + done + echo "=========================================" + fi +fi + +# Exit with appropriate code +if [[ $failed -gt 0 ]]; then + exit 1 +fi diff --git a/scripts/ssb/sql/q1.1.sql b/scripts/ssb/sql/q1.1.sql new file mode 100644 index 00000000000..02e3844d12c --- /dev/null +++ b/scripts/ssb/sql/q1.1.sql @@ -0,0 +1,7 @@ +SELECT SUM(lo_extendedprice * lo_discount) AS REVENUE +FROM lineorder, dates +WHERE + lo_orderdate = d_datekey + AND d_year = 1993 + AND lo_discount BETWEEN 1 AND 3 + AND lo_quantity < 25; \ No newline at end of file diff --git a/scripts/ssb/sql/q1.2.sql b/scripts/ssb/sql/q1.2.sql new file mode 100644 index 00000000000..834d73f623f --- /dev/null +++ b/scripts/ssb/sql/q1.2.sql @@ -0,0 +1,7 @@ +SELECT SUM(lo_extendedprice * lo_discount) AS REVENUE +FROM lineorder, dates +WHERE + lo_orderdate = d_datekey + AND d_yearmonth = 'Jan1994' + AND lo_discount BETWEEN 4 AND 6 + AND lo_quantity BETWEEN 26 AND 35; \ No newline at end of file diff --git a/scripts/ssb/sql/q1.3.sql b/scripts/ssb/sql/q1.3.sql new file mode 100644 index 00000000000..7a09490b840 --- /dev/null +++ b/scripts/ssb/sql/q1.3.sql @@ -0,0 +1,9 @@ +SELECT + SUM(lo_extendedprice * lo_discount) AS REVENUE +FROM lineorder, dates +WHERE + lo_orderdate = d_datekey + AND d_weeknuminyear = 6 + AND d_year = 1994 + AND lo_discount BETWEEN 5 AND 7 + AND lo_quantity BETWEEN 26 AND 35; \ No newline at end of file diff --git a/scripts/ssb/sql/q2.1.sql b/scripts/ssb/sql/q2.1.sql new file mode 100644 index 00000000000..f455ff9e935 --- /dev/null +++ b/scripts/ssb/sql/q2.1.sql @@ -0,0 +1,10 @@ +SELECT SUM(lo_revenue), d_year, p_brand +FROM lineorder, dates, part, supplier +WHERE + lo_orderdate = d_datekey + AND lo_partkey = p_partkey + AND lo_suppkey = s_suppkey + AND p_category = 'MFGR#12' + AND s_region = 'AMERICA' +GROUP BY d_year, p_brand +ORDER BY p_brand; \ No newline at end of file diff --git a/scripts/ssb/sql/q2.2.sql b/scripts/ssb/sql/q2.2.sql new file mode 100644 index 00000000000..e28d55153c2 --- /dev/null +++ b/scripts/ssb/sql/q2.2.sql @@ -0,0 +1,10 @@ +SELECT SUM(lo_revenue), d_year, p_brand +FROM lineorder, dates, part, supplier +WHERE + lo_orderdate = d_datekey + AND lo_partkey = p_partkey + AND lo_suppkey = s_suppkey + AND p_brand BETWEEN 'MFGR#2221' AND 'MFGR#2228' + AND s_region = 'ASIA' +GROUP BY d_year, p_brand +ORDER BY d_year, p_brand; \ No newline at end of file diff --git a/scripts/ssb/sql/q2.3.sql b/scripts/ssb/sql/q2.3.sql new file mode 100644 index 00000000000..8ec135cef0a --- /dev/null +++ b/scripts/ssb/sql/q2.3.sql @@ -0,0 +1,10 @@ +SELECT SUM(lo_revenue), d_year, p_brand +FROM lineorder, dates, part, supplier +WHERE + lo_orderdate = d_datekey + AND lo_partkey = p_partkey + AND lo_suppkey = s_suppkey + AND p_brand = 'MFGR#2239' + AND s_region = 'EUROPE' +GROUP BY d_year, p_brand +ORDER BY d_year, p_brand; \ No newline at end of file diff --git a/scripts/ssb/sql/q3.1.sql b/scripts/ssb/sql/q3.1.sql new file mode 100644 index 00000000000..badd93f973a --- /dev/null +++ b/scripts/ssb/sql/q3.1.sql @@ -0,0 +1,16 @@ +SELECT + c_nation, + s_nation, + d_year, + SUM(lo_revenue) AS REVENUE +FROM customer, lineorder, supplier, dates +WHERE + lo_custkey = c_custkey + AND lo_suppkey = s_suppkey + AND lo_orderdate = d_datekey + AND c_region = 'ASIA' + AND s_region = 'ASIA' + AND d_year >= 1992 + AND d_year <= 1997 +GROUP BY c_nation, s_nation, d_year +ORDER BY d_year ASC, REVENUE DESC; \ No newline at end of file diff --git a/scripts/ssb/sql/q3.2.sql b/scripts/ssb/sql/q3.2.sql new file mode 100644 index 00000000000..fc5564d3b6e --- /dev/null +++ b/scripts/ssb/sql/q3.2.sql @@ -0,0 +1,16 @@ +SELECT + c_city, + s_city, + d_year, + SUM(lo_revenue) AS REVENUE +FROM customer, lineorder, supplier, dates +WHERE + lo_custkey = c_custkey + AND lo_suppkey = s_suppkey + AND lo_orderdate = d_datekey + AND c_nation = 'UNITED STATES' + AND s_nation = 'UNITED STATES' + AND d_year >= 1992 + AND d_year <= 1997 +GROUP BY c_city, s_city, d_year +ORDER BY d_year ASC, REVENUE DESC; \ No newline at end of file diff --git a/scripts/ssb/sql/q3.3.sql b/scripts/ssb/sql/q3.3.sql new file mode 100644 index 00000000000..5fdfdf39eae --- /dev/null +++ b/scripts/ssb/sql/q3.3.sql @@ -0,0 +1,22 @@ +SELECT + c_city, + s_city, + d_year, + SUM(lo_revenue) AS REVENUE +FROM customer, lineorder, supplier, dates +WHERE + lo_custkey = c_custkey + AND lo_suppkey = s_suppkey + AND lo_orderdate = d_datekey + AND ( + c_city = 'UNITED KI1' + OR c_city = 'UNITED KI5' + ) + AND ( + s_city = 'UNITED KI1' + OR s_city = 'UNITED KI5' + ) + AND d_year >= 1992 + AND d_year <= 1997 +GROUP BY c_city, s_city, d_year +ORDER BY d_year ASC, REVENUE DESC; \ No newline at end of file diff --git a/scripts/ssb/sql/q3.4.sql b/scripts/ssb/sql/q3.4.sql new file mode 100644 index 00000000000..a94a81795f5 --- /dev/null +++ b/scripts/ssb/sql/q3.4.sql @@ -0,0 +1,21 @@ +SELECT + c_city, + s_city, + d_year, + SUM(lo_revenue) AS REVENUE +FROM customer, lineorder, supplier, dates +WHERE + lo_custkey = c_custkey + AND lo_suppkey = s_suppkey + AND lo_orderdate = d_datekey + AND ( + c_city = 'UNITED KI1' + OR c_city = 'UNITED KI5' + ) + AND ( + s_city = 'UNITED KI1' + OR s_city = 'UNITED KI5' + ) + AND d_yearmonth = 'Dec1997' +GROUP BY c_city, s_city, d_year +ORDER BY d_year ASC, REVENUE DESC; \ No newline at end of file diff --git a/scripts/ssb/sql/q4.1.sql b/scripts/ssb/sql/q4.1.sql new file mode 100644 index 00000000000..a7d48bfe436 --- /dev/null +++ b/scripts/ssb/sql/q4.1.sql @@ -0,0 +1,18 @@ +SELECT + d_year, + c_nation, + SUM(lo_revenue - lo_supplycost) AS PROFIT +FROM dates, customer, supplier, part, lineorder +WHERE + lo_custkey = c_custkey + AND lo_suppkey = s_suppkey + AND lo_partkey = p_partkey + AND lo_orderdate = d_datekey + AND c_region = 'AMERICA' + AND s_region = 'AMERICA' + AND ( + p_mfgr = 'MFGR#1' + OR p_mfgr = 'MFGR#2' + ) +GROUP BY d_year, c_nation +ORDER BY d_year, c_nation; diff --git a/scripts/ssb/sql/q4.2.sql b/scripts/ssb/sql/q4.2.sql new file mode 100644 index 00000000000..1c68951d58d --- /dev/null +++ b/scripts/ssb/sql/q4.2.sql @@ -0,0 +1,23 @@ +SELECT + d_year, + s_nation, + p_category, + SUM(lo_revenue - lo_supplycost) AS PROFIT +FROM dates, customer, supplier, part, lineorder +WHERE + lo_custkey = c_custkey + AND lo_suppkey = s_suppkey + AND lo_partkey = p_partkey + AND lo_orderdate = d_datekey + AND c_region = 'AMERICA' + AND s_region = 'AMERICA' + AND ( + d_year = 1997 + OR d_year = 1998 + ) + AND ( + p_mfgr = 'MFGR#1' + OR p_mfgr = 'MFGR#2' + ) +GROUP BY d_year, s_nation, p_category +ORDER BY d_year, s_nation, p_category; \ No newline at end of file diff --git a/scripts/ssb/sql/q4.3.sql b/scripts/ssb/sql/q4.3.sql new file mode 100644 index 00000000000..815ab2d8a56 --- /dev/null +++ b/scripts/ssb/sql/q4.3.sql @@ -0,0 +1,19 @@ +SELECT + d_year, + s_city, + p_brand, + SUM(lo_revenue - lo_supplycost) AS PROFIT +FROM dates, customer, supplier, part, lineorder +WHERE + lo_custkey = c_custkey + AND lo_suppkey = s_suppkey + AND lo_partkey = p_partkey + AND lo_orderdate = d_datekey + AND s_nation = 'UNITED STATES' + AND ( + d_year = 1997 + OR d_year = 1998 + ) + AND p_category = 'MFGR#14' +GROUP BY d_year, s_city, p_brand +ORDER BY d_year, s_city, p_brand; \ No newline at end of file diff --git a/spark_config.xml b/spark_config.xml new file mode 100644 index 00000000000..8db991ba42d --- /dev/null +++ b/spark_config.xml @@ -0,0 +1,12 @@ + + + + + SSB_Q1_1_Test + local[*] + 4g + 4g + 2g + + + \ No newline at end of file diff --git a/src/main/java/org/apache/sysds/hops/BinaryOp.java b/src/main/java/org/apache/sysds/hops/BinaryOp.java index 2b803a053c1..4dd5e1f243d 100644 --- a/src/main/java/org/apache/sysds/hops/BinaryOp.java +++ b/src/main/java/org/apache/sysds/hops/BinaryOp.java @@ -854,6 +854,9 @@ else if( (op == OpOp2.CBIND && getDataType().isList()) _etype = ExecType.CP; } + if( _etype == ExecType.OOC ) //TODO + setExecType(ExecType.CP); + //mark for recompile (forever) setRequiresRecompileIfNecessary(); diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/CacheableData.java b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/CacheableData.java index 34a8aa18631..67f9f698a97 100644 --- a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/CacheableData.java +++ b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/CacheableData.java @@ -73,8 +73,8 @@ /** * Each object of this class is a cache envelope for some large piece of data - * called "cache block". For example, the body of a matrix can be the cache block. - * The term cache block refers strictly to the cacheable portion of the data object, + * called "cache block". For example, the body of a matrix can be the cache block. + * The term cache block refers strictly to the cacheable portion of the data object, * often excluding metadata and auxiliary parameters, as defined in the subclasses. * Under the protection of the envelope, the data blob may be evicted to * the file system; then the subclass must set its reference to null @@ -96,43 +96,43 @@ public abstract class CacheableData> extends Data public static final String CACHING_EVICTION_FILEEXTENSION = ".dat"; public static final boolean CACHING_ASYNC_FILECLEANUP = true; public static boolean CACHING_ASYNC_SERIALIZE = false; - + //NOTE CACHING_ASYNC_SERIALIZE: - // The serialization of matrices and frames (ultra-sparse matrices or - // frames with strings) into buffer pool byte arrays happens outside the + // The serialization of matrices and frames (ultra-sparse matrices or + // frames with strings) into buffer pool byte arrays happens outside the // critical region of the global lock in LazyWriteBuffer. However, it still - // requires thread-local serialization (before returning from release) in - // order to guarantee that not too many objects are pinned at the same time - // which would violate the memory budget. Therefore, the new asynchronous + // requires thread-local serialization (before returning from release) in + // order to guarantee that not too many objects are pinned at the same time + // which would violate the memory budget. Therefore, the new asynchronous // serialization (see CACHING_ASYNC_SERIALIZE) should be understood as // optimistic with weaker guarantees. - + /** * Defines all possible cache status types for a data blob. * An object of class {@link CacheableData} can be in one of the following * five status types: * - * EMPTY: Either there is no data blob at all, or the data blob + * EMPTY: Either there is no data blob at all, or the data blob * resides in a specified import file and has never been downloaded yet. * READ: The data blob is in main memory; one or more threads are * referencing and reading it (shared "read-only" lock). This status uses a * counter. Eviction is NOT allowed. * MODIFY: The data blob is in main memory; exactly one thread is * referencing and modifying it (exclusive "write" lock). Eviction is NOT allowed. - * CACHED: The data blob is in main memory, and nobody is using nor referencing it. + * CACHED: The data blob is in main memory, and nobody is using nor referencing it. * There is always an persistent recovery object for it **/ public enum CacheStatus { - EMPTY, - READ, - MODIFY, + EMPTY, + READ, + MODIFY, CACHED, CACHED_NOWRITE, } - + /** Global flag indicating if caching is enabled (controls eviction) */ private static volatile boolean _activeFlag = false; - + /** Global sequence for generating unique ids. */ private static IDSequence _seq = null; @@ -147,9 +147,9 @@ public enum CacheStatus { @Override protected Long initialValue() { return 0L; } }; - //current size of live broadcast objects (because Spark's ContextCleaner maintains - //a buffer with references to prevent eager cleanup by GC); note that this is an - //overestimate, because we maintain partitioned broadcasts as soft references, which + //current size of live broadcast objects (because Spark's ContextCleaner maintains + //a buffer with references to prevent eager cleanup by GC); note that this is an + //overestimate, because we maintain partitioned broadcasts as soft references, which //might be collected by the GC and subsequently cleaned up by Spark's ContextCleaner. private static final AtomicLong _refBCs = new AtomicLong(0); @@ -159,16 +159,16 @@ public enum CacheStatus { /** * The unique (JVM-wide) ID of a cacheable data object; to ensure unique IDs across JVMs, we - * concatenate filenames with a unique prefix (map task ID). + * concatenate filenames with a unique prefix (map task ID). */ private final long _uniqueID; - + /** The cache status of the data blob (whether it can be or is evicted, etc. */ private CacheStatus _cacheStatus = null; - + /** Cache for actual data, evicted by garbage collector. */ protected SoftReference _cache = null; - + /** Container object that holds the actual data. */ protected T _data = null; @@ -177,47 +177,47 @@ public enum CacheStatus { * includes: 1) Matrix dimensions, if available 2) Number of non-zeros, if * available 3) Block dimensions, if applicable 4) InputInfo -- subsequent * operations that use this Matrix expect it to be in this format. - * + * * When the matrix is written to HDFS (local file system, as well?), one * must get the OutputInfo that matches with InputInfo stored inside _mtd. */ protected MetaData _metaData = null; - + protected FederationMap _fedMapping = null; protected boolean _compressed = false; protected long _compressedSize = -1; - + /** The name of HDFS file in which the data is backed up. */ protected String _hdfsFileName = null; // file name and path protected boolean _isPRead = false; //persistent read, must not be deleted - - /** - * Flag that indicates whether or not hdfs file exists.It is used - * for improving the performance of "rmvar" instruction. When it has - * value false, one can skip file system existence + + /** + * Flag that indicates whether or not hdfs file exists.It is used + * for improving the performance of "rmvar" instruction. When it has + * value false, one can skip file system existence * checks which can be expensive. */ - private boolean _hdfsFileExists = false; + private boolean _hdfsFileExists = false; /** Information relevant to specific external file formats. */ private FileFormatProperties _formatProps = null; - + /** * true if the in-memory or evicted matrix may be different from * the matrix located at {@link #_hdfsFileName}; false if the two * matrices should be the same. */ private boolean _dirtyFlag = false; - + // additional private flags and meta data private int _numReadThreads = 0; //number of threads for read from HDFS - private boolean _cleanupFlag = true; //flag if obj unpinned (cleanup enabled) + private boolean _cleanupFlag = true; //flag if obj unpinned (cleanup enabled) private String _cacheFileName = null; //local eviction file name private boolean _requiresLocalWrite = false; //flag if local write for read obj - private boolean _isAcquireFromEmpty = false; //flag if read from status empty - + private boolean _isAcquireFromEmpty = false; //flag if read from status empty + //backend-specific handles //note: we use the abstraction of LineageObjects for two reasons: (1) to keep track of cleanup //for lazily evaluated RDDs, and (2) as abstraction for environments that do not necessarily have spark libraries available @@ -225,13 +225,13 @@ public enum CacheStatus { private BroadcastObject _bcHandle = null; //Broadcast handle protected HashMap _gpuObjects = null; //Per GPUContext object allocated on GPU //TODO generalize for frames - private OOCStreamable _streamHandle = null; - + private LocalTaskQueue _streamHandle = null; + private LineageItem _lineage = null; - + /** * Basic constructor for any cacheable data. - * + * * @param dt data type * @param vt value type */ @@ -242,28 +242,28 @@ protected CacheableData(DataType dt, ValueType vt) { _numReadThreads = 0; _gpuObjects = DMLScript.USE_ACCELERATOR ? new HashMap<>() : null; } - + /** * Copy constructor for cacheable data (of same type). - * + * * @param that cacheable data object */ protected CacheableData(CacheableData that) { this( that.getDataType(), that.getValueType() ); _cleanupFlag = that._cleanupFlag; _hdfsFileName = that._hdfsFileName; - _hdfsFileExists = that._hdfsFileExists; + _hdfsFileExists = that._hdfsFileExists; _gpuObjects = that._gpuObjects; _dirtyFlag = that._dirtyFlag; _compressed = that._compressed; _compressedSize = that._compressedSize; _fedMapping = that._fedMapping; } - + /** - * Enables or disables the cleanup of the associated + * Enables or disables the cleanup of the associated * data object on clearData(). - * + * * @param flag true if cleanup */ public void enableCleanup(boolean flag) { @@ -271,15 +271,15 @@ public void enableCleanup(boolean flag) { } /** - * Indicates if cleanup of the associated data object + * Indicates if cleanup of the associated data object * is enabled on clearData(). - * + * * @return true if cleanup enabled */ public boolean isCleanupEnabled() { return _cleanupFlag; } - + public CacheStatus getStatus() { return _cacheStatus; } @@ -295,15 +295,15 @@ public void setHDFSFileExists( boolean flag ) { public String getFileName() { return _hdfsFileName; } - + public boolean isPersistentRead() { return _isPRead; } - + public void setPersistentRead(boolean pread) { _isPRead = pread; } - + public long getUniqueID() { return _uniqueID; } @@ -314,12 +314,12 @@ public synchronized void setFileName( String file ) { _dirtyFlag = true; _hdfsFileName = file; } - + /** * true if the in-memory or evicted matrix may be different from * the matrix located at {@link #_hdfsFileName}; false if the two * matrices are supposed to be the same. - * + * * @return true if dirty */ public boolean isDirty() { @@ -337,7 +337,7 @@ public FileFormatProperties getFileFormatProperties() { public void setFileFormatProperties(FileFormatProperties props) { _formatProps = props; } - + @Override public void setMetaData(MetaData md) { _metaData = md; @@ -351,7 +351,7 @@ public void setCompressedSize(long size){ public boolean isCompressed(){ return _compressed; } - + public long getCompressedSize(){ return _compressedSize; } @@ -365,7 +365,7 @@ public MetaData getMetaData() { public void removeMetaData() { _metaData = null; } - + public DataCharacteristics getDataCharacteristics() { return _metaData.getDataCharacteristics(); } @@ -381,11 +381,11 @@ public long getNumRows() { public long getNumColumns() { return getDataCharacteristics().getCols(); } - + public int getBlocksize() { return getDataCharacteristics().getBlocksize(); } - + public abstract void refreshMetaData(); public LineageItem getCacheLineage() { @@ -419,15 +419,15 @@ public boolean isFederated() { } return _fedMapping != null; } - + public boolean isFederated(FType type) { return isFederated() && (type == null || _fedMapping.getType().isType(type)); } - + public boolean isFederatedExcept(FType type) { return isFederated() && !isFederated(type); } - + /** * Gets the mapping of indices ranges to federated objects. * @return fedMapping mapping @@ -435,7 +435,7 @@ public boolean isFederatedExcept(FType type) { public FederationMap getFedMapping() { return _fedMapping; } - + /** * Sets the mapping of indices ranges to federated objects. * @param fedMapping mapping @@ -443,7 +443,7 @@ public FederationMap getFedMapping() { public void setFedMapping(FederationMap fedMapping) { _fedMapping = fedMapping; } - + public RDDObject getRDDHandle() { return _rddHandle; } @@ -452,7 +452,7 @@ public void setRDDHandle( RDDObject rdd ) { //cleanup potential old back reference if( _rddHandle != null ) _rddHandle.setBackReference(null); - + //add new rdd handle _rddHandle = rdd; if( _rddHandle != null ) @@ -462,7 +462,7 @@ public void setRDDHandle( RDDObject rdd ) { public boolean hasRDDHandle() { return _rddHandle != null && _rddHandle.hasBackReference(); } - + public BroadcastObject getBroadcastHandle() { return _bcHandle; } @@ -470,44 +470,17 @@ public BroadcastObject getBroadcastHandle() { public boolean hasBroadcastHandle() { return _bcHandle != null && _bcHandle.hasBackReference(); } - - public OOCStream getStreamHandle() { - if( !hasStreamHandle() ) { - final SubscribableTaskQueue _mStream = new SubscribableTaskQueue<>(); - _streamHandle = _mStream; - DataCharacteristics dc = getDataCharacteristics(); - MatrixBlock src = (MatrixBlock)acquireReadAndRelease(); - LongStream.range(0, dc.getNumBlocks()) - .mapToObj(i -> UtilFunctions.createIndexedMatrixBlock(src, dc, i)) - .forEach( blk -> { - try{ - _mStream.enqueue(blk); - } - catch(Exception ex) { - throw ex instanceof DMLRuntimeException ? (DMLRuntimeException) ex : new DMLRuntimeException(ex); - }}); - _mStream.closeInput(); - } - - return _streamHandle.getReadStream(); + + public LocalTaskQueue getStreamHandle() { + return _streamHandle; } - - /** - * Probes if stream handle is existing, because getStreamHandle - * creates a new stream if not existing. - * - * @return true if existing, false otherwise - */ - public boolean hasStreamHandle() { - return _streamHandle != null && !_streamHandle.isProcessed(); - } @SuppressWarnings({ "rawtypes", "unchecked" }) public void setBroadcastHandle( BroadcastObject bc ) { //cleanup potential old back reference if( _bcHandle != null ) _bcHandle.setBackReference(null); - + //add new broadcast handle _bcHandle = bc; if( _bcHandle != null ) @@ -527,15 +500,15 @@ public synchronized void setGPUObject(GPUContext gCtx, GPUObject gObj) { if (old != null) throw new DMLRuntimeException("GPU : Inconsistent internal state - this CacheableData already has a GPUObject assigned to the current GPUContext (" + gCtx + ")"); } - + public synchronized void removeGPUObject(GPUContext gCtx) { _gpuObjects.remove(gCtx); } - public synchronized void setStreamHandle(OOCStreamable q) { + public synchronized void setStreamHandle(LocalTaskQueue q) { _streamHandle = q; } - + // ********************************************* // *** *** // *** HIGH-LEVEL METHODS THAT SPECIFY *** @@ -548,38 +521,38 @@ public T acquireReadAndRelease() { release(); return tmp; } - + /** * Acquires a shared "read-only" lock, produces the reference to the cache block, * restores the cache block to main memory, reads from HDFS if needed. - * + * * Synchronized because there might be parallel threads (parfor local) that * access the same object (in case it was created before the loop). - * + * * In-Status: EMPTY, EVICTABLE, EVICTED, READ; * Out-Status: READ(+1). - * + * * @return cacheable data */ public T acquireRead() { long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; - + //core internal acquire (synchronized per object) T ret = acquireReadIntern(); - + //update thread-local status (after pin but outside the //critical section of accessing a shared object) if( !isBelowCachingThreshold() ) updateStatusPinned(true); - + if( DMLScript.STATISTICS ){ long t1 = System.nanoTime(); CacheStatistics.incrementAcquireRTime(t1-t0); } - + return ret; } - + private synchronized T acquireReadIntern() { if ( !isAvailableToRead() ) throw new DMLRuntimeException("MatrixObject not available to read."); @@ -591,7 +564,7 @@ private synchronized T acquireReadIntern() { if (OptimizerUtils.isUMMEnabled()) //track and make space in the UMM UnifiedMemoryManager.pin(this); - + //call acquireHostRead if gpuHandle is set as well as is allocated if( DMLScript.USE_ACCELERATOR && _gpuObjects != null ) { boolean copiedFromGPU = false; @@ -606,7 +579,7 @@ else if (gObj != null) { } } } - + //read data from HDFS/RDD if required //(probe data for cache_nowrite / jvm_reuse) if( _data==null && ( isEmpty(true) || hasValidLineage() )) { @@ -625,20 +598,20 @@ && getRDDHandle() == null) ) { //mark for initial local write despite read operation _requiresLocalWrite = false; } - else if( hasStreamHandle() ) { - _data = readBlobFromStream( getStreamHandle().toLocalTaskQueue() ); + else if( getStreamHandle() != null ) { + _data = readBlobFromStream( getStreamHandle() ); } else if( getRDDHandle()==null || getRDDHandle().allowsShortCircuitRead() ) { if( DMLScript.STATISTICS ) CacheStatistics.incrementHDFSHits(); - + //check filename if( _hdfsFileName == null ) throw new DMLRuntimeException("Cannot read matrix for empty filename."); - + //read cacheable data from hdfs _data = readBlobFromHDFS( _hdfsFileName ); - + //mark for initial local write despite read operation _requiresLocalWrite = false; } @@ -646,11 +619,11 @@ else if( getRDDHandle()==null || getRDDHandle().allowsShortCircuitRead() ) { //read matrix from rdd (incl execute pending rdd operations) MutableBoolean writeStatus = new MutableBoolean(); _data = readBlobFromRDD( getRDDHandle(), writeStatus ); - + //mark for initial local write (prevent repeated execution of rdd operations) _requiresLocalWrite = !writeStatus.booleanValue(); } - + setDirty(false); } catch (IOException e) { @@ -667,7 +640,7 @@ else if( _data!=null && DMLScript.STATISTICS ) { return _data; } - + /** * Acquires the exclusive "write" lock for a thread that wants to throw away the * old cache block data and link up with new cache block data. Abandons the old data @@ -675,93 +648,93 @@ else if( _data!=null && DMLScript.STATISTICS ) { * In-Status: EMPTY, EVICTABLE, EVICTED; * Out-Status: MODIFY. - * + * * @param newData new data * @return cacheable data */ public T acquireModify(T newData) { long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; - + //core internal acquire (synchronized per object) T ret = acquireModifyIntern(newData); - + //update thread-local status (after pin but outside the //critical section of accessing a shared object) if( !isBelowCachingThreshold() ) updateStatusPinned(true); - + if( DMLScript.STATISTICS ){ long t1 = System.nanoTime(); CacheStatistics.incrementAcquireMTime(t1-t0); if (DMLScript.JMLC_MEM_STATISTICS) Statistics.addCPMemObject(System.identityHashCode(this), getDataSize()); } - + if(newData instanceof CompressedMatrixBlock) { setCompressedSize(newData.getInMemorySize()); } return ret; } - + private synchronized T acquireModifyIntern(T newData) { if (! isAvailableToModify ()) throw new DMLRuntimeException("CacheableData not available to modify."); - + //clear old data clearData(); - + //cache status maintenance acquire (true, false); //no need to load evicted matrix - + setDirty(true); _isAcquireFromEmpty = false; - + //set references to new data if (newData == null) throw new DMLRuntimeException("acquireModify with empty cache block."); return _data = newData; } - + /** * Releases the shared ("read-only") or exclusive ("write") lock. Updates * size information, last-access time, metadata, etc. - * + * * Synchronized because there might be parallel threads (parfor local) that * access the same object (in case it was created before the loop). - * + * * In-Status: READ, MODIFY; * Out-Status: READ(-1), EVICTABLE, EMPTY. - * + * */ public void release() { long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; - + //update thread-local status (before unpin but outside //the critical section of accessing a shared object) if( !isBelowCachingThreshold() ) updateStatusPinned(false); - + //core internal release (synchronized per object) releaseIntern(); - + if( DMLScript.STATISTICS ){ long t1 = System.nanoTime(); CacheStatistics.incrementReleaseTime(t1-t0); } } - + private synchronized void releaseIntern() { boolean write = false; if ( isModify() ) { //set flags for write write = true; setDirty(true); - + //update meta data refreshMetaData(); - - //compact empty in-memory block + + //compact empty in-memory block _data.compactEmptyBlock(); } @@ -771,7 +744,7 @@ private synchronized void releaseIntern() { //cache status maintenance (pass cacheNoWrite flag) release(_isAcquireFromEmpty && !_requiresLocalWrite); - + if( isCachingActive() //only if caching is enabled (otherwise keep everything in mem) && isCached(true) //not empty and not read/modify && !isBelowCachingThreshold() ) //min size for caching @@ -793,39 +766,39 @@ && isCached(true) //not empty and not read/modify if( DMLScript.STATISTICS && write && hasValidLineage() ) CacheStatistics.incrementLinWrites(); - + //create cache createCache(); _data = null; } } - + public void clearData() { clearData(-1); } - + /** * Sets the cache block reference to null, abandons the old block. * Makes the "envelope" empty. Run it to finalize the object (otherwise the * evicted cache block file may remain undeleted). - * + * * In-Status: EMPTY, EVICTABLE, EVICTED; * Out-Status: EMPTY. - * + * * @param tid thread ID - * + * */ - public synchronized void clearData(long tid) + public synchronized void clearData(long tid) { - // check if cleanup enabled and possible - if( !isCleanupEnabled() ) + // check if cleanup enabled and possible + if( !isCleanupEnabled() ) return; // do nothing if( !isAvailableToModify() ) throw new DMLRuntimeException("CacheableData (" + getDebugName() + ") not available to " + "modify. Status = " + _cacheStatus.name() + "."); - + // clear existing WB / FS representation (but prevent unnecessary probes) - if( !(isEmpty(true)||(_data!=null && isBelowCachingThreshold()) + if( !(isEmpty(true)||(_data!=null && isBelowCachingThreshold()) ||(_data!=null && !isCachingActive()) )) //additional condition for JMLC freeEvictedBlob(); @@ -833,7 +806,7 @@ public synchronized void clearData(long tid) _data = null; clearCache(); setCacheLineage(null); - + // clear rdd/broadcast back refs if( _rddHandle != null ) _rddHandle.setBackReference(null); @@ -845,11 +818,11 @@ public synchronized void clearData(long tid) gObj.clearData(null, DMLScript.EAGER_CUDA_FREE); } } - + //clear federated matrix if( _fedMapping != null ) _fedMapping.execCleanup(tid, _fedMapping.getID()); - + // change object state EMPTY setDirty(false); setEmpty(); @@ -858,13 +831,13 @@ public synchronized void clearData(long tid) public synchronized void exportData() { exportData( -1 ); } - + /** * Writes, or flushes, the cache block data to HDFS. - * + * * In-Status: EMPTY, EVICTABLE, EVICTED, READ; * Out-Status: EMPTY, EVICTABLE, EVICTED, READ. - * + * * @param replication ? */ public synchronized void exportData( int replication ) { @@ -878,18 +851,18 @@ public synchronized void exportData(String fName, String outputFormat) { public synchronized void exportData(String fName, String outputFormat, FileFormatProperties formatProperties) { exportData(fName, outputFormat, -1, formatProperties); } - + /** * Synchronized because there might be parallel threads (parfor local) that * access the same object (in case it was created before the loop). * If all threads export the same data object concurrently it results in errors * because they all write to the same file. Efficiency for loops and parallel threads * is achieved by checking if the in-memory block is dirty. - * + * * NOTE: MB: we do not use dfs copy from local (evicted) to HDFS because this would ignore * the output format and most importantly would bypass reblocking during write (which effects the - * potential degree of parallelism). However, we copy files on HDFS if certain criteria are given. - * + * potential degree of parallelism). However, we copy files on HDFS if certain criteria are given. + * * @param fName file name * @param outputFormat format * @param replication ? @@ -905,7 +878,7 @@ public synchronized void exportData (String fName, String outputFormat, int repl if( LOG.isTraceEnabled() ) LOG.trace("Exporting " + this.getDebugName() + " to " + fName + " in format " + outputFormat); - + if( DMLScript.USE_ACCELERATOR && _gpuObjects != null ) { boolean copiedFromGPU = false; for (Map.Entry kv : _gpuObjects.entrySet()) { @@ -919,12 +892,12 @@ public synchronized void exportData (String fName, String outputFormat, int repl } } } - + //check for persistent or transient writes boolean pWrite = !fName.equals(_hdfsFileName); if( !pWrite ) setHDFSFileExists(true); - + //check for common file scheme (otherwise no copy/rename) int blen = (formatProperties == null) ? ConfigurationManager.getBlocksize() : formatProperties.getBlocksize(); @@ -933,7 +906,7 @@ public synchronized void exportData (String fName, String outputFormat, int repl boolean eqFormat = isEqualOutputFormat(outputFormat); boolean eqBlksize = (getBlocksize() != blen) && (outputFormat == null || outputFormat.equals("binary")); - + //actual export (note: no direct transfer of local copy in order to ensure blocking (and hence, parallelism)) if( isDirty() || !eqScheme || isFederated() || (pWrite && (!eqFormat | !eqBlksize)) ) @@ -957,7 +930,7 @@ public synchronized void exportData (String fName, String outputFormat, int repl if( isEmpty(true) && !federatedWrite) { //read data from HDFS if required (never read before), this applies only to pWrite w/ different output formats - //note: for large rdd outputs, we compile dedicated writespinstructions (no need to handle this here) + //note: for large rdd outputs, we compile dedicated writespinstructions (no need to handle this here) try { if( getRDDHandle()==null || getRDDHandle().allowsShortCircuitRead() ) _data = readBlobFromHDFS( _hdfsFileName ); @@ -972,15 +945,15 @@ else if(!federatedWrite) throw new DMLRuntimeException("Reading of " + _hdfsFileName + " ("+hashCode()+") failed.", e); } } - + //get object from cache if(!federatedWrite) { if( _data == null ) getCache(); acquire( false, _data==null ); //incl. read matrix if evicted } - - // b) write the matrix + + // b) write the matrix try { writeMetaData( fName, outputFormat, formatProperties ); writeBlobToHDFS( fName, outputFormat, replication, formatProperties ); @@ -1014,7 +987,7 @@ else if( pWrite ) // pwrite with same output format } } else if( getRDDHandle()!=null && getRDDHandle().isPending() - && !getRDDHandle().isHDFSFile() + && !getRDDHandle().isHDFSFile() && !getRDDHandle().allowsShortCircuitRead() ) { //CASE 3: pending rdd operation (other than checkpoints) @@ -1031,25 +1004,25 @@ else if( getRDDHandle()!=null && getRDDHandle().isPending() throw new DMLRuntimeException("Export to " + fName + " failed.", e); } } - else + else { //CASE 4: data already in hdfs (do nothing, no need for export) if( LOG.isTraceEnabled() ) LOG.trace(this.getDebugName() + ": Skip export to hdfs since data already exists."); } - + _hdfsFileExists = true; if( DMLScript.STATISTICS ){ long t1 = System.nanoTime(); CacheStatistics.incrementExportTime(t1-t0); } } - + // --------- ABSTRACT LOW-LEVEL CACHE I/O OPERATIONS ---------- /** * Checks if the data blob reference points to some in-memory object. - * This method is called when releasing the (last) lock. Do not call + * This method is called when releasing the (last) lock. Do not call * this method for a blob that has been evicted. * * @return true if the blob is in main memory and the @@ -1068,11 +1041,11 @@ protected boolean isBlobPresent() { protected void restoreBlobIntoMemory() { String cacheFilePathAndName = getCacheFilePathAndName(); long begin = LOG.isTraceEnabled() ? System.currentTimeMillis() : 0; - + if( LOG.isTraceEnabled() ) - LOG.trace ("CACHE: Restoring matrix... " + hashCode() + " HDFS path: " + + LOG.trace ("CACHE: Restoring matrix... " + hashCode() + " HDFS path: " + (_hdfsFileName == null ? "null" : _hdfsFileName) + ", Restore from path: " + cacheFilePathAndName); - + if (_data != null) throw new DMLRuntimeException(cacheFilePathAndName + " : Cannot restore on top of existing in-memory data."); @@ -1080,20 +1053,20 @@ protected void restoreBlobIntoMemory() { _data = readBlobFromCache(cacheFilePathAndName); } catch (IOException e) { - throw new DMLRuntimeException(cacheFilePathAndName + " : Restore failed.", e); + throw new DMLRuntimeException(cacheFilePathAndName + " : Restore failed.", e); } - + //check for success if (_data == null) throw new DMLRuntimeException (cacheFilePathAndName + " : Restore failed."); - + if( LOG.isTraceEnabled() ) LOG.trace("Restoring matrix - COMPLETED ... " + (System.currentTimeMillis()-begin) + " msec."); } protected abstract T readBlobFromCache(String fname) throws IOException; - + /** * Low-level cache I/O method that deletes the file containing the * evicted data blob, without reading it. @@ -1103,16 +1076,16 @@ public final void freeEvictedBlob() { String cacheFilePathAndName = getCacheFilePathAndName(); long begin = LOG.isTraceEnabled() ? System.currentTimeMillis() : 0; if( LOG.isTraceEnabled() ) - LOG.trace("CACHE: Freeing evicted matrix... " + hashCode() + " HDFS path: " + + LOG.trace("CACHE: Freeing evicted matrix... " + hashCode() + " HDFS path: " + (_hdfsFileName == null ? "null" : _hdfsFileName) + " Eviction path: " + cacheFilePathAndName); - + if(isCachingActive()) { if (OptimizerUtils.isUMMEnabled()) UnifiedMemoryManager.deleteBlock(cacheFilePathAndName); else LazyWriteBuffer.deleteBlock(cacheFilePathAndName); } - + if( LOG.isTraceEnabled() ) LOG.trace("Freeing evicted matrix - COMPLETED ... " + (System.currentTimeMillis()-begin) + " msec."); } @@ -1120,7 +1093,7 @@ public final void freeEvictedBlob() { protected boolean isBelowCachingThreshold() { return (_data.getInMemorySize() <= CACHING_THRESHOLD); } - + public static boolean isBelowCachingThreshold(CacheBlock data) { boolean ret; if (OptimizerUtils.isUMMEnabled()) @@ -1129,11 +1102,11 @@ public static boolean isBelowCachingThreshold(CacheBlock data) { ret = LazyWriteBuffer.getCacheBlockSize(data) <= CACHING_THRESHOLD; return ret; } - + public long getDataSize() { return (_data != null) ?_data.getInMemorySize() : 0; } - + protected ValueType[] getSchema() { return null; } @@ -1141,8 +1114,8 @@ protected ValueType[] getSchema() { @Override //Data public synchronized String getDebugName() { int maxLength = 23; - String debugNameEnding = (_hdfsFileName == null ? "null" : - (_hdfsFileName.length() < maxLength ? _hdfsFileName : "..." + + String debugNameEnding = (_hdfsFileName == null ? "null" : + (_hdfsFileName.length() < maxLength ? _hdfsFileName : "..." + _hdfsFileName.substring (_hdfsFileName.length() - maxLength + 3))); return hashCode() + " " + debugNameEnding; } @@ -1172,7 +1145,7 @@ protected T readBlobFromFederated(FederationMap fedMap) throws IOException { DataCharacteristics dc = iimd.getDataCharacteristics(); return readBlobFromFederated(fedMap, dc.getDims()); } - + protected abstract T readBlobFromFederated(FederationMap fedMap, long[] dims) throws IOException; @@ -1181,22 +1154,22 @@ protected abstract void writeBlobToHDFS(String fname, String ofmt, int rep, File protected abstract long writeStreamToHDFS(String fname, String ofmt, int rep, FileFormatProperties fprop) throws IOException; - + protected abstract void writeBlobFromRDDtoHDFS(RDDObject rdd, String fname, String ofmt) throws IOException; protected abstract T reconstructByLineage(LineageItem li) throws IOException; - + protected void writeMetaData (String filePathAndName, String outputFormat, FileFormatProperties formatProperties) throws IOException - { + { MetaDataFormat iimd = (MetaDataFormat) _metaData; - + if (iimd == null) throw new DMLRuntimeException("Unexpected error while writing mtd file (" + filePathAndName + ") -- metadata is null."); - + // Write the matrix to HDFS in requested format FileFormat fmt = (outputFormat != null) ? FileFormat.safeValueOf(outputFormat) : iimd.getFileFormat(); if ( fmt != FileFormat.MM ) { @@ -1204,15 +1177,15 @@ protected void writeMetaData (String filePathAndName, String outputFormat, FileF DataCharacteristics dc = iimd.getDataCharacteristics(); if( formatProperties != null && formatProperties.knownBlocksize() ) dc.setBlocksize(formatProperties.getBlocksize()); - + // when outputFormat is binaryblock, make sure that matrixCharacteristics has correct blocking dimensions - // note: this is only required if singlenode (due to binarycell default) + // note: this is only required if singlenode (due to binarycell default) if ( fmt == FileFormat.BINARY && DMLScript.getGlobalExecMode() == ExecMode.SINGLE_NODE && dc.getBlocksize() != ConfigurationManager.getBlocksize() ) { dc = new MatrixCharacteristics(dc.getRows(), dc.getCols(), dc.getBlocksize(), dc.getNonZeros()); } - + //write the actual meta data file HDFSTool.writeMetaDataFile (filePathAndName + ".mtd", valueType, getSchema(), dataType, dc, fmt, formatProperties); @@ -1226,9 +1199,9 @@ protected boolean isEqualOutputFormat(String outputFormat) { } return true; } - + // ------------- IMPLEMENTED CACHE LOGIC METHODS -------------- - + protected String getCacheFilePathAndName () { if( _cacheFileName==null ) { StringBuilder sb = new StringBuilder(); @@ -1238,15 +1211,15 @@ protected String getCacheFilePathAndName () { sb.append(CacheableData.CACHING_EVICTION_FILEEXTENSION); _cacheFileName = sb.toString(); } - + return _cacheFileName; } - + /** * This method "acquires the lock" to ensure that the data blob is in main memory * (not evicted) while it is being accessed. When called, the method will try to * restore the blob if it has been evicted. There are two kinds of locks it may - * acquire: a shared "read" lock (if the argument is false) or the + * acquire: a shared "read" lock (if the argument is false) or the * exclusive "modify" lock (if the argument is true). * The method can fail in three ways: * (1) if there is lock status conflict; @@ -1256,9 +1229,9 @@ protected String getCacheFilePathAndName () { * its last-access timestamp. For the shared "read" lock, acquiring a new lock * increments the associated count. The "read" count has to be decremented once * the blob is no longer used, which may re-enable eviction. This method has to - * be called only once per matrix operation and coupled with {@link #release()}, + * be called only once per matrix operation and coupled with {@link #release()}, * because it increments the lock count and the other method decrements this count. - * + * * @param isModify : true for the exclusive "modify" lock, * false for a shared "read" lock. * @param restore true if restore @@ -1290,7 +1263,7 @@ protected void acquire (boolean isModify, boolean restore) { LOG.trace("Acquired lock on " + getDebugName() + ", status: " + _cacheStatus.name() ); } - + /** * Call this method to permit eviction for the stored data blob, or to * decrement its "read" count if it is "read"-locked by other threads. @@ -1300,7 +1273,7 @@ protected void acquire (boolean isModify, boolean restore) { * called only once per process and coupled with {@link #acquire(boolean, boolean)}, * because it decrements the lock count and the other method increments * the lock count. - * + * * @param cacheNoWrite ? */ protected void release(boolean cacheNoWrite) @@ -1321,37 +1294,37 @@ protected void release(boolean cacheNoWrite) setEmpty(); break; } - + if( LOG.isTraceEnabled() ) LOG.trace("Released lock on " + getDebugName() + ", status: " + _cacheStatus.name()); - + } - + // ************************************************** // *** *** // *** CACHE STATUS FIELD - CLASSES AND METHODS *** // *** *** // ************************************************** - + public boolean isCached(boolean inclCachedNoWrite) { return _cacheStatus == CacheStatus.CACHED || (inclCachedNoWrite && _cacheStatus == CacheStatus.CACHED_NOWRITE); } - + public void setEmptyStatus() { setEmpty(); } - + protected boolean isEmpty(boolean inclCachedNoWrite) { return _cacheStatus == CacheStatus.EMPTY || (inclCachedNoWrite && _cacheStatus == CacheStatus.CACHED_NOWRITE); } - + protected boolean isModify() { return (_cacheStatus == CacheStatus.MODIFY); } - + public boolean isPendingRDDOps() { return isEmpty(true) && _data == null && (_rddHandle != null && _rddHandle.hasBackReference()); } @@ -1364,11 +1337,11 @@ public boolean isDeviceToHostCopy() { protected void setEmpty() { _cacheStatus = CacheStatus.EMPTY; } - + protected void setModify() { _cacheStatus = CacheStatus.MODIFY; } - + protected void setCached() { _cacheStatus = CacheStatus.CACHED; } @@ -1377,25 +1350,25 @@ protected void addOneRead() { _numReadThreads ++; _cacheStatus = CacheStatus.READ; } - + protected void removeOneRead(boolean doesBlobExist, boolean cacheNoWrite) { _numReadThreads --; if (_numReadThreads == 0) { if( cacheNoWrite ) - _cacheStatus = (doesBlobExist ? + _cacheStatus = (doesBlobExist ? CacheStatus.CACHED_NOWRITE : CacheStatus.EMPTY); else - _cacheStatus = (doesBlobExist ? + _cacheStatus = (doesBlobExist ? CacheStatus.CACHED : CacheStatus.EMPTY); } } - + protected boolean isAvailableToRead() { return (_cacheStatus != CacheStatus.MODIFY); } - + protected boolean isAvailableToModify() { - return ( _cacheStatus == CacheStatus.EMPTY + return ( _cacheStatus == CacheStatus.EMPTY || _cacheStatus == CacheStatus.CACHED || _cacheStatus == CacheStatus.CACHED_NOWRITE); } @@ -1406,10 +1379,10 @@ protected boolean isAvailableToModify() { // *** FOR SOFTREFERENCE CACHE *** // *** *** // ******************************************* - + /** * Creates a new cache soft reference to the currently - * referenced cache block. + * referenced cache block. */ protected void createCache( ) { if( _cache == null || _cache.get() == null ) @@ -1425,7 +1398,7 @@ protected void getCache() { _data = _cache.get(); } } - + /** Clears the cache soft reference if existing. */ protected void clearCache() { if( _cache != null ) { @@ -1445,39 +1418,39 @@ protected void updateStatusPinned(boolean add) { protected static long getPinnedSize() { return sizePinned.get(); } - + public static void addBroadcastSize(long size) { _refBCs.addAndGet(size); } - + public static long getBroadcastSize() { //scale the total sum of all broadcasts by the current fraction //of local memory to equally distribute it across parfor workers return (long) (_refBCs.longValue() * InfrastructureAnalyzer.getLocalMaxMemoryFraction()); } - + // --------- STATIC CACHE INIT/CLEANUP OPERATIONS ---------- public synchronized static void cleanupCacheDir() { //cleanup remaining cached writes LazyWriteBuffer.cleanup(); UnifiedMemoryManager.cleanup(); - + //delete cache dir and files cleanupCacheDir(true); } - + /** * Deletes the DML-script-specific caching working dir. - * + * * @param withDir if true, delete directory */ public synchronized static void cleanupCacheDir(boolean withDir) { //get directory name String dir = cacheEvictionLocalFilePath; - + //clean files with cache prefix if( dir != null ) //if previous init cache { @@ -1491,30 +1464,30 @@ public synchronized static void cleanupCacheDir(boolean withDir) fdir.delete(); //deletes dir only if empty } } - + _activeFlag = false; } - + /** * Inits caching with the default uuid of DMLScript - * + * * @throws IOException if IOException occurs */ - public synchronized static void initCaching() + public synchronized static void initCaching() throws IOException { initCaching(DMLScript.getUUID()); } - + /** * Creates the DML-script-specific caching working dir. - * + * * Takes the UUID in order to allow for custom uuid, e.g., for remote parfor caching - * + * * @param uuid ID * @throws IOException if IOException occurs */ - public synchronized static void initCaching( String uuid ) + public synchronized static void initCaching( String uuid ) throws IOException { try @@ -1527,7 +1500,7 @@ public synchronized static void initCaching( String uuid ) { throw new IOException(e); } - + if (OptimizerUtils.isUMMEnabled()) //init unified memory manager UnifiedMemoryManager.init(); @@ -1542,26 +1515,26 @@ public synchronized static void initCaching( String uuid ) public static boolean isCachingActive() { return _activeFlag; } - + public static void disableCaching() { _activeFlag = false; } - + public static void enableCaching() { _activeFlag = true; } public synchronized boolean moveData(String fName, String outputFormat) { boolean ret = false; - + try { //check for common file scheme (otherwise no copy/rename) boolean eqScheme = IOUtilFunctions.isSameFileScheme( new Path(_hdfsFileName), new Path(fName)); - + //export or rename to target file on hdfs - if( isDirty() || !eqScheme || (!isEqualOutputFormat(outputFormat) && isEmpty(true)) + if( isDirty() || !eqScheme || (!isEqualOutputFormat(outputFormat) && isEmpty(true)) || (getRDDHandle()!=null && !HDFSTool.existsFileOnHDFS(_hdfsFileName)) ) { exportData(fName, outputFormat); @@ -1579,7 +1552,7 @@ else if( isEqualOutputFormat(outputFormat) ) catch (Exception e) { throw new DMLRuntimeException("Move to " + fName + " failed.", e); } - + return ret; } @@ -1587,7 +1560,7 @@ else if( isEqualOutputFormat(outputFormat) ) public String toString() { return toString(false); } - + @Override public String toString(boolean metaOnly) { StringBuilder str = new StringBuilder(); diff --git a/src/main/java/org/apache/sysds/runtime/instructions/CPInstructionParser.java b/src/main/java/org/apache/sysds/runtime/instructions/CPInstructionParser.java index 92e11b425dd..b44e06ad2d0 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/CPInstructionParser.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/CPInstructionParser.java @@ -82,92 +82,92 @@ public static CPInstruction parseSingleInstruction (String str ) { throw new DMLRuntimeException("Unable to parse instruction: " + str); return cpinst; } - + public static CPInstruction parseSingleInstruction ( InstructionType cptype, String str ) { ExecType execType; - if ( str == null || str.isEmpty() ) + if ( str == null || str.isEmpty() ) return null; switch(cptype) { case AggregateUnary: return AggregateUnaryCPInstruction.parseInstruction(str); - + case AggregateBinary: return AggregateBinaryCPInstruction.parseInstruction(str); - + case AggregateTernary: return AggregateTernaryCPInstruction.parseInstruction(str); - + case Unary: return UnaryCPInstruction.parseInstruction(str); case Binary: return BinaryCPInstruction.parseInstruction(str); - + case Ternary: return TernaryCPInstruction.parseInstruction(str); - + case Quaternary: return QuaternaryCPInstruction.parseInstruction(str); - + case BuiltinNary: return BuiltinNaryCPInstruction.parseInstruction(str); - + case Ctable: return CtableCPInstruction.parseInstruction(str); - + case Reorg: return ReorgCPInstruction.parseInstruction(str); - + case Dnn: return DnnCPInstruction.parseInstruction(str); - + case UaggOuterChain: return UaggOuterChainCPInstruction.parseInstruction(str); - + case Reshape: return ReshapeCPInstruction.parseInstruction(str); - + case Append: return AppendCPInstruction.parseInstruction(str); - + case Variable: return VariableCPInstruction.parseInstruction(str); - + case Rand: return DataGenCPInstruction.parseInstruction(str); case StringInit: return StringInitCPInstruction.parseInstruction(str); - + case FCall: return FunctionCallCPInstruction.parseInstruction(str); case ParameterizedBuiltin: return ParameterizedBuiltinCPInstruction.parseInstruction(str); - + case MultiReturnParameterizedBuiltin: return MultiReturnParameterizedBuiltinCPInstruction.parseInstruction(str); - + case MultiReturnComplexMatrixBuiltin: return MultiReturnComplexMatrixBuiltinCPInstruction.parseInstruction(str); - + case MultiReturnBuiltin: return MultiReturnBuiltinCPInstruction.parseInstruction(str); - + case QSort: return QuantileSortCPInstruction.parseInstruction(str); - + case QPick: return QuantilePickCPInstruction.parseInstruction(str); - + case MatrixIndexing: - execType = ExecType.valueOf( str.split(Instruction.OPERAND_DELIM)[0] ); + execType = ExecType.valueOf( str.split(Instruction.OPERAND_DELIM)[0] ); if( execType == ExecType.CP ) return IndexingCPInstruction.parseInstruction(str); else //exectype CP_FILE return MatrixIndexingCPFileInstruction.parseInstruction(str); - - case Builtin: + + case Builtin: String[] parts = InstructionUtils.getInstructionPartsWithValueType(str); if(parts[0].equals(Opcodes.LOG.toString()) || parts[0].equals(Opcodes.LOGNZ.toString())) { if(InstructionUtils.isInteger(parts[3])) // B=log(A), y=log(x) @@ -177,44 +177,44 @@ public static CPInstruction parseSingleInstruction ( InstructionType cptype, Str return BinaryCPInstruction.parseInstruction(str); } throw new DMLRuntimeException("Invalid Builtin Instruction: " + str ); - + case MMTSJ: return MMTSJCPInstruction.parseInstruction(str); - + case PMMJ: return PMMJCPInstruction.parseInstruction(str); - + case MMChain: return MMChainCPInstruction.parseInstruction(str); - + case CentralMoment: return CentralMomentCPInstruction.parseInstruction(str); - + case Covariance: return CovarianceCPInstruction.parseInstruction(str); case Compression: return CompressionCPInstruction.parseInstruction(str); - + case DeCompression: return DeCompressionCPInstruction.parseInstruction(str); - + case QuantizeCompression: LOG.debug("Parsing Quantize Compress instruction"); - return CompressionCPInstruction.parseQuantizationFusedInstruction(str); + return CompressionCPInstruction.parseQuantizationFusedInstruction(str); case Local: return LocalCPInstruction.parseInstruction(str); case SpoofFused: return SpoofCPInstruction.parseInstruction(str); - + case Sql: return SqlCPInstruction.parseInstruction(str); - + case Prefetch: return PrefetchCPInstruction.parseInstruction(str); - + case Broadcast: return BroadcastCPInstruction.parseInstruction(str); @@ -223,10 +223,10 @@ public static CPInstruction parseSingleInstruction ( InstructionType cptype, Str case Union: return UnionCPInstruction.parseInstruction(str); - + case EINSUM: return EinsumCPInstruction.parseInstruction(str); - + default: throw new DMLRuntimeException("Invalid CP Instruction Type: " + cptype ); } diff --git a/src/main/java/org/apache/sysds/runtime/instructions/OOCInstructionParser.java b/src/main/java/org/apache/sysds/runtime/instructions/OOCInstructionParser.java index f23ad6d67a6..8b64073111c 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/OOCInstructionParser.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/OOCInstructionParser.java @@ -38,6 +38,8 @@ import org.apache.sysds.runtime.instructions.ooc.MatrixVectorBinaryOOCInstruction; import org.apache.sysds.runtime.instructions.ooc.TransposeOOCInstruction; import org.apache.sysds.runtime.instructions.ooc.TeeOOCInstruction; +import org.apache.sysds.runtime.instructions.ooc.OOCInstruction; +import org.apache.sysds.runtime.instructions.ooc.ReblockOOCInstruction; public class OOCInstructionParser extends InstructionParser { protected static final Log LOG = LogFactory.getLog(OOCInstructionParser.class.getName()); @@ -78,7 +80,7 @@ public static OOCInstruction parseSingleInstruction(InstructionType ooctype, Str case Tee: return TeeOOCInstruction.parseInstruction(str); case CentralMoment: - return CentralMomentOOCInstruction.parseInstruction(str); + return CentralMomentOOCInstruction.parseInstruction(str); case Ctable: return CtableOOCInstruction.parseInstruction(str); case ParameterizedBuiltin: diff --git a/src/main/java/org/apache/sysds/runtime/instructions/ooc/AggregateUnaryOOCInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/ooc/AggregateUnaryOOCInstruction.java index 2a53c5400ae..34df0f4d249 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/ooc/AggregateUnaryOOCInstruction.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/ooc/AggregateUnaryOOCInstruction.java @@ -30,37 +30,27 @@ import org.apache.sysds.runtime.instructions.cp.DoubleObject; import org.apache.sysds.runtime.instructions.spark.data.IndexedMatrixValue; import org.apache.sysds.runtime.matrix.data.MatrixBlock; -import org.apache.sysds.runtime.matrix.data.MatrixIndexes; import org.apache.sysds.runtime.matrix.data.OperationsOnMatrixValues; import org.apache.sysds.runtime.matrix.operators.AggregateOperator; import org.apache.sysds.runtime.matrix.operators.AggregateUnaryOperator; -import org.apache.sysds.runtime.matrix.operators.Operator; -import org.apache.sysds.runtime.meta.DataCharacteristics; -import java.util.HashMap; public class AggregateUnaryOOCInstruction extends ComputationOOCInstruction { private AggregateOperator _aop = null; - protected AggregateUnaryOOCInstruction(OOCType type, AggregateUnaryOperator auop, AggregateOperator aop, + protected AggregateUnaryOOCInstruction(OOCType type, AggregateUnaryOperator auop, AggregateOperator aop, CPOperand in, CPOperand out, String opcode, String istr) { super(type, auop, in, out, opcode, istr); _aop = aop; } - protected AggregateUnaryOOCInstruction(OOCType type, Operator op, CPOperand in1, CPOperand in2, CPOperand in3, - CPOperand out, String opcode, String istr) { - super(type, op, in1, in2, in3, out, opcode, istr); - _aop = null; - } - public static AggregateUnaryOOCInstruction parseInstruction(String str) { String[] parts = InstructionUtils.getInstructionPartsWithValueType(str); InstructionUtils.checkNumFields(parts, 2); String opcode = parts[0]; CPOperand in1 = new CPOperand(parts[1]); CPOperand out = new CPOperand(parts[2]); - + String aopcode = InstructionUtils.deriveAggregateOperatorOpcode(opcode); CorrectionLocationType corrLoc = InstructionUtils.deriveAggregateOperatorCorrectionLocation(opcode); AggregateUnaryOperator aggun = InstructionUtils.parseBasicAggregateUnaryOperator(opcode); @@ -68,112 +58,37 @@ public static AggregateUnaryOOCInstruction parseInstruction(String str) { return new AggregateUnaryOOCInstruction( OOCType.AggregateUnary, aggun, aop, in1, out, opcode, str); } - + @Override public void processInstruction( ExecutionContext ec ) { - //TODO support all types of aggregations, currently only full aggregation, row aggregation and column aggregation - + //TODO support all types of aggregations, currently only full aggregation + //setup operators and input queue - AggregateUnaryOperator aggun = (AggregateUnaryOperator) getOperator(); + AggregateUnaryOperator aggun = (AggregateUnaryOperator) getOperator(); MatrixObject min = ec.getMatrixObject(input1); - OOCStream q = min.getStreamHandle(); + LocalTaskQueue q = min.getStreamHandle(); + IndexedMatrixValue tmp = null; int blen = ConfigurationManager.getBlocksize(); - if (aggun.isRowAggregate() || aggun.isColAggregate()) { - DataCharacteristics chars = ec.getDataCharacteristics(input1.getName()); - // number of blocks to process per aggregation idx (row or column dim) - long emitThreshold = aggun.isRowAggregate()? chars.getNumColBlocks() : chars.getNumRowBlocks(); - OOCMatrixBlockTracker aggTracker = new OOCMatrixBlockTracker(emitThreshold); - HashMap corrs = new HashMap<>(); // correction blocks - - OOCStream qOut = createWritableStream(); - ec.getMatrixObject(output).setStreamHandle(qOut); - - submitOOCTask(() -> { - IndexedMatrixValue tmp = null; - try { - while((tmp = q.dequeue()) != LocalTaskQueue.NO_MORE_TASKS) { - long idx = aggun.isRowAggregate() ? - tmp.getIndexes().getRowIndex() : tmp.getIndexes().getColumnIndex(); - MatrixBlock ret = aggTracker.get(idx); - if(ret != null) { - MatrixBlock corr = corrs.get(idx); - - // aggregation - MatrixBlock ltmp = (MatrixBlock) ((MatrixBlock) tmp.getValue()) - .aggregateUnaryOperations(aggun, new MatrixBlock(), blen, tmp.getIndexes()); - OperationsOnMatrixValues.incrementalAggregation(ret, - _aop.existsCorrection() ? corr : null, ltmp, _aop, true); - - if (!aggTracker.putAndIncrementCount(idx, ret)){ - corrs.replace(idx, corr); - continue; - } - } - else { - // first block for this idx - init aggregate and correction - // TODO avoid corr block for inplace incremental aggregation - int rows = tmp.getValue().getNumRows(); - int cols = tmp.getValue().getNumColumns(); - int extra = _aop.correction.getNumRemovedRowsColumns(); - ret = aggun.isRowAggregate()? new MatrixBlock(rows, 1 + extra, false) : new MatrixBlock(1 + extra, cols, false); - MatrixBlock corr = aggun.isRowAggregate()? new MatrixBlock(rows, 1 + extra, false) : new MatrixBlock(1 + extra, cols, false); - - // aggregation - MatrixBlock ltmp = (MatrixBlock) ((MatrixBlock) tmp.getValue()).aggregateUnaryOperations( - aggun, new MatrixBlock(), blen, tmp.getIndexes()); - OperationsOnMatrixValues.incrementalAggregation(ret, - _aop.existsCorrection() ? corr : null, ltmp, _aop, true); - - if(emitThreshold > 1){ - aggTracker.putAndIncrementCount(idx, ret); - corrs.put(idx, corr); - continue; - } - } - - // all input blocks for this idx processed - emit aggregated block - ret.dropLastRowsOrColumns(_aop.correction); - MatrixIndexes midx = aggun.isRowAggregate() ? - new MatrixIndexes(tmp.getIndexes().getRowIndex(), 1) : - new MatrixIndexes(1, tmp.getIndexes().getColumnIndex()); - IndexedMatrixValue tmpOut = new IndexedMatrixValue(midx, ret); - - qOut.enqueue(tmpOut); - // drop intermediate states - aggTracker.remove(idx); - corrs.remove(idx); - } - qOut.closeInput(); - } - catch(Exception ex) { - throw new DMLRuntimeException(ex); - } - }, q, qOut); - } - // full aggregation - else { - IndexedMatrixValue tmp = null; - //read blocks and aggregate immediately into result - int extra = _aop.correction.getNumRemovedRowsColumns(); - MatrixBlock ret = new MatrixBlock(1,1+extra,false); - MatrixBlock corr = new MatrixBlock(1,1+extra,false); - try { - while((tmp = q.dequeue()) != LocalTaskQueue.NO_MORE_TASKS) { - //block aggregation - MatrixBlock ltmp = (MatrixBlock) ((MatrixBlock) tmp.getValue()) - .aggregateUnaryOperations(aggun, new MatrixBlock(), blen, tmp.getIndexes()); - //accumulation into final result - OperationsOnMatrixValues.incrementalAggregation( - ret, _aop.existsCorrection() ? corr : null, ltmp, _aop, true); - } + //read blocks and aggregate immediately into result + int extra = _aop.correction.getNumRemovedRowsColumns(); + MatrixBlock ret = new MatrixBlock(1,1+extra,false); + MatrixBlock corr = new MatrixBlock(1,1+extra,false); + try { + while((tmp = q.dequeueTask()) != LocalTaskQueue.NO_MORE_TASKS) { + //block aggregation + MatrixBlock ltmp = (MatrixBlock) ((MatrixBlock) tmp.getValue()) + .aggregateUnaryOperations(aggun, new MatrixBlock(), blen, tmp.getIndexes()); + //accumulation into final result + OperationsOnMatrixValues.incrementalAggregation( + ret, _aop.existsCorrection() ? corr : null, ltmp, _aop, true); } - catch(Exception ex) { - throw new DMLRuntimeException(ex); - } - - //create scalar output - ec.setScalarOutput(output.getName(), new DoubleObject(ret.get(0, 0))); } + catch(Exception ex) { + throw new DMLRuntimeException(ex); + } + + //create scalar output + ec.setScalarOutput(output.getName(), new DoubleObject(ret.get(0, 0))); } } diff --git a/src/main/java/org/apache/sysds/runtime/instructions/ooc/ComputationOOCInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/ooc/ComputationOOCInstruction.java index 4dcdffcb0dc..bc5a4d841b4 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/ooc/ComputationOOCInstruction.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/ooc/ComputationOOCInstruction.java @@ -33,7 +33,7 @@ protected ComputationOOCInstruction(OOCType type, Operator op, CPOperand in1, CP input3 = null; output = out; } - + protected ComputationOOCInstruction(OOCType type, Operator op, CPOperand in1, CPOperand in2, CPOperand out, String opcode, String istr) { super(type, op, opcode, istr); input1 = in1; diff --git a/src/main/java/org/apache/sysds/runtime/instructions/ooc/ReblockOOCInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/ooc/ReblockOOCInstruction.java index 74b15c9fb0e..1f7fce3b146 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/ooc/ReblockOOCInstruction.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/ooc/ReblockOOCInstruction.java @@ -41,7 +41,7 @@ public class ReblockOOCInstruction extends ComputationOOCInstruction { private int blen; - private ReblockOOCInstruction(Operator op, CPOperand in, CPOperand out, + private ReblockOOCInstruction(Operator op, CPOperand in, CPOperand out, int br, int bc, String opcode, String instr) { super(OOCType.Reblock, op, in, out, opcode, instr); @@ -71,29 +71,29 @@ public void processInstruction(ExecutionContext ec) { //get the source format from the meta data //MetaDataFormat iimd = (MetaDataFormat) min.getMetaData(); - //TODO support other formats than binary - + //TODO support other formats than binary + //create queue, spawn thread for asynchronous reading, and return OOCStream q = createWritableStream(); submitOOCTask(() -> readBinaryBlock(q, min.getFileName()), q); - + MatrixObject mout = ec.getMatrixObject(output); mout.setStreamHandle(q); } - + @SuppressWarnings("resource") private void readBinaryBlock(OOCStream q, String fname) { try { //prepare file access - JobConf job = new JobConf(ConfigurationManager.getCachedJobConf()); - Path path = new Path( fname ); + JobConf job = new JobConf(ConfigurationManager.getCachedJobConf()); + Path path = new Path( fname ); FileSystem fs = IOUtilFunctions.getFileSystem(path, job); - + //check existence and non-empty file - MatrixReader.checkValidInputFile(fs, path); - + MatrixReader.checkValidInputFile(fs, path); + //core reading - for( Path lpath : IOUtilFunctions.getSequenceFilePaths(fs, path) ) { //1..N files + for( Path lpath : IOUtilFunctions.getSequenceFilePaths(fs, path) ) { //1..N files //directly read from sequence files (individual partfiles) try( SequenceFile.Reader reader = new SequenceFile .Reader(job, SequenceFile.Reader.file(lpath)) ) diff --git a/src/test/java/org/apache/sysds/test/functions/ooc/SumScalarMultiplicationTest.java b/src/test/java/org/apache/sysds/test/functions/ooc/SumScalarMultiplicationTest.java index f0d9228a533..2272588bab4 100644 --- a/src/test/java/org/apache/sysds/test/functions/ooc/SumScalarMultiplicationTest.java +++ b/src/test/java/org/apache/sysds/test/functions/ooc/SumScalarMultiplicationTest.java @@ -23,7 +23,6 @@ import org.apache.sysds.common.Types; import org.apache.sysds.common.Types.FileFormat; import org.apache.sysds.common.Types.ValueType; -import org.apache.sysds.hops.OptimizerUtils; import org.apache.sysds.runtime.instructions.Instruction; import org.apache.sysds.runtime.io.MatrixWriter; import org.apache.sysds.runtime.io.MatrixWriterFactory; @@ -58,26 +57,11 @@ public void setUp() { * Test the sum of scalar multiplication, "sum(X*7)", with OOC backend. */ @Test - public void testSumScalarMultNoRewrite() { - testSumScalarMult(false); - } - - /** - * Test the sum of scalar multiplication, "sum(X)*7", with OOC backend. - */ - @Test - public void testSumScalarMultRewrite() { - testSumScalarMult(true); - } - - - public void testSumScalarMult(boolean rewrite) - { + public void testSumScalarMult() { + Types.ExecMode platformOld = rtplatform; rtplatform = Types.ExecMode.SINGLE_NODE; - boolean oldRewrite = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION; - OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = rewrite; - + try { getAndLoadTestConfiguration(TEST_NAME); String HOME = SCRIPT_DIR + TEST_DIR; @@ -108,17 +92,16 @@ public void testSumScalarMult(boolean rewrite) String prefix = Instruction.OOC_INST_PREFIX; Assert.assertTrue("OOC wasn't used for RBLK", heavyHittersContainsString(prefix + Opcodes.RBLK)); - if(!rewrite) - Assert.assertTrue("OOC wasn't used for SUM", - heavyHittersContainsString(prefix + Opcodes.MULT)); Assert.assertTrue("OOC wasn't used for SUM", heavyHittersContainsString(prefix + Opcodes.UAKP)); + +// boolean usedOOCMult = Statistics.getCPHeavyHitterOpCodes().contains(prefix + Opcodes.MULT); +// Assert.assertTrue("OOC wasn't used for MULT", usedOOCMult); } catch(Exception ex) { Assert.fail(ex.getMessage()); } finally { - OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = oldRewrite; resetExecMode(platformOld); } }