diff --git a/benchmarks/tpc/README.md b/benchmarks/tpc/README.md
index eb1fb04791..319288df05 100644
--- a/benchmarks/tpc/README.md
+++ b/benchmarks/tpc/README.md
@@ -38,15 +38,17 @@ All benchmarks are run via `run.py`:
 python3 run.py --engine --benchmark [options]
 ```
 
-| Option         | Description                                      |
-| -------------- | ------------------------------------------------ |
-| `--engine`     | Engine name (matches a TOML file in `engines/`)  |
-| `--benchmark`  | `tpch` or `tpcds`                                |
-| `--iterations` | Number of iterations (default: 1)                |
-| `--output`     | Output directory (default: `.`)                  |
-| `--query`      | Run a single query number                        |
-| `--no-restart` | Skip Spark master/worker restart                 |
-| `--dry-run`    | Print the spark-submit command without executing |
+| Option         | Description                                              |
+| -------------- | -------------------------------------------------------- |
+| `--engine`     | Engine name (matches a TOML file in `engines/`)          |
+| `--benchmark`  | `tpch` or `tpcds`                                        |
+| `--iterations` | Number of iterations (default: 1)                        |
+| `--output`     | Output directory (default: `.`)                          |
+| `--query`      | Run a single query number                                |
+| `--no-restart` | Skip Spark master/worker restart                         |
+| `--dry-run`    | Print the spark-submit command without executing         |
+| `--jfr`        | Enable Java Flight Recorder profiling                    |
+| `--jfr-dir`    | Directory for JFR output files (default: `/results/jfr`) |
 
 Available engines: `spark`, `comet`, `comet-iceberg`, `gluten`
 
@@ -363,3 +365,30 @@ python3 generate-comparison.py --benchmark tpch \
   --title "TPC-H @ 100 GB: Parquet vs Iceberg" \
   comet-tpch-*.json comet-iceberg-tpch-*.json
 ```
+
+## Java Flight Recorder Profiling
+
+Use the `--jfr` flag to capture JFR profiles from the Spark driver and executors.
+JFR is built into JDK 11+, so no additional dependencies are needed.
+
+```shell
+python3 run.py --engine comet --benchmark tpch --jfr
+```
+
+JFR recordings are written to `/results/jfr/` by default (configurable with
+`--jfr-dir`). The driver writes `driver.jfr` and each executor writes
+`executor.jfr` (JFR appends the PID when multiple executors share a path).
+
+With Docker Compose, the `/results` volume is shared across all containers,
+so JFR files from both driver and executors are collected in
+`$RESULTS_DIR/jfr/` on the host:
+
+```shell
+docker compose -f benchmarks/tpc/infra/docker/docker-compose.yml \
+  run --rm bench \
+  python3 /opt/benchmarks/run.py \
+  --engine comet --benchmark tpch --output /results --no-restart --jfr
+```
+
+Open the `.jfr` files with [JDK Mission Control](https://jdk.java.net/jmc/),
+IntelliJ IDEA's profiler, or the `jfr` CLI tool (`jfr summary driver.jfr`).
diff --git a/benchmarks/tpc/infra/docker/docker-compose-laptop.yml b/benchmarks/tpc/infra/docker/docker-compose-laptop.yml
index bc882ae7b9..7272684065 100644
--- a/benchmarks/tpc/infra/docker/docker-compose-laptop.yml
+++ b/benchmarks/tpc/infra/docker/docker-compose-laptop.yml
@@ -72,6 +72,7 @@ services:
       - SPARK_NO_DAEMONIZE=true
     mem_limit: 8g
     memswap_limit: 8g
+    stop_grace_period: 30s
 
   bench:
     image: ${BENCH_IMAGE:-comet-bench}
diff --git a/benchmarks/tpc/infra/docker/docker-compose.yml b/benchmarks/tpc/infra/docker/docker-compose.yml
index 5a76a5d6ec..f5c9f0ebe9 100644
--- a/benchmarks/tpc/infra/docker/docker-compose.yml
+++ b/benchmarks/tpc/infra/docker/docker-compose.yml
@@ -56,6 +56,7 @@ x-worker: &worker
     - SPARK_NO_DAEMONIZE=true
   mem_limit: ${WORKER_MEM_LIMIT:-32g}
   memswap_limit: ${WORKER_MEM_LIMIT:-32g}
+  stop_grace_period: 30s
 
 services:
   spark-master:
diff --git a/benchmarks/tpc/run.py b/benchmarks/tpc/run.py
index 38b0ed500b..58afc0bbea 100755
--- a/benchmarks/tpc/run.py
+++ b/benchmarks/tpc/run.py
@@ -261,6 +261,24 @@ def build_spark_submit_cmd(config, benchmark, args):
             val = "true" if val else "false"
         conf[resolve_env(key)] = resolve_env(str(val))
 
+    # JFR profiling: append to extraJavaOptions (preserving any existing values)
+    if args.jfr:
+        jfr_dir = args.jfr_dir
+        driver_jfr = (
+            f"-XX:StartFlightRecording=disk=true,dumponexit=true,"
+            f"filename={jfr_dir}/driver.jfr,settings=profile"
+        )
+        executor_jfr = (
+            f"-XX:StartFlightRecording=disk=true,dumponexit=true,"
+            f"filename={jfr_dir}/executor.jfr,settings=profile"
+        )
+        for spark_key, jfr_opts in [
+            ("spark.driver.extraJavaOptions", driver_jfr),
+            ("spark.executor.extraJavaOptions", executor_jfr),
+        ]:
+            existing = conf.get(spark_key, "")
+            conf[spark_key] = f"{existing} {jfr_opts}".strip()
+
     for key, val in sorted(conf.items()):
         cmd += ["--conf", f"{key}={val}"]
 
@@ -357,6 +375,16 @@
         action="store_true",
         help="Print the spark-submit command without executing",
     )
+    parser.add_argument(
+        "--jfr",
+        action="store_true",
+        help="Enable Java Flight Recorder profiling for driver and executors",
+    )
+    parser.add_argument(
+        "--jfr-dir",
+        default="/results/jfr",
+        help="Directory for JFR output files (default: /results/jfr)",
+    )
     args = parser.parse_args()
 
     config = load_engine_config(args.engine)
@@ -373,6 +401,10 @@
     if not args.no_restart and not args.dry_run:
         restart_spark()
 
+    # Create JFR output directory if profiling is enabled
+    if args.jfr:
+        os.makedirs(args.jfr_dir, exist_ok=True)
+
     cmd = build_spark_submit_cmd(config, args.benchmark, args)
 
     if args.dry_run: