Recursively scan 14 quarterly Parquet files across year-based subdirectories with file_filter, max_rows, row_group_filter, and file_metadata for 73,089-row analytics.
-- ============================================================================
-- SETUP
-- ============================================================================
CREATE ZONE IF NOT EXISTS {{zone_name}} TYPE EXTERNAL;
CREATE SCHEMA IF NOT EXISTS {{zone_name}}.parquet_demos;
-- Table 1: recursive scan, predicate pushdown enabled
CREATE EXTERNAL TABLE IF NOT EXISTS {{zone_name}}.parquet_demos.all_orders
USING PARQUET
LOCATION '{{data_path}}/orders'
OPTIONS (
recursive = 'true',
row_group_filter = 'true',
file_metadata = '{"columns":["df_file_name","df_row_number"]}'
);
-- Table 2: one year via glob
CREATE EXTERNAL TABLE IF NOT EXISTS {{zone_name}}.parquet_demos.orders_2015
USING PARQUET
LOCATION '{{data_path}}/orders'
OPTIONS (
recursive = 'true',
file_filter = 'Orders_2015*',
file_metadata = '{"columns":["df_file_name","df_row_number"]}'
);
-- Table 3: balanced sample (100 rows per file)
CREATE EXTERNAL TABLE IF NOT EXISTS {{zone_name}}.parquet_demos.orders_sample
USING PARQUET
LOCATION '{{data_path}}/orders'
OPTIONS (
recursive = 'true',
max_rows = '100',
file_metadata = '{"columns":["df_file_name","df_row_number"]}'
);
-- Table 4: single-quarter drill-down
CREATE EXTERNAL TABLE IF NOT EXISTS {{zone_name}}.parquet_demos.orders_q1_2014
USING PARQUET
LOCATION '{{data_path}}/orders'
OPTIONS (
recursive = 'true',
file_filter = 'Orders_2014-03*',
file_metadata = '{"columns":["df_file_name","df_row_number"]}'
);
-- ============================================================================
-- QUERIES
-- ============================================================================
-- Full recursive scan
ASSERT ROW_COUNT = 73089
SELECT * FROM {{zone_name}}.parquet_demos.all_orders;
-- 14 distinct source files across 5 year directories
ASSERT VALUE file_count = 14
SELECT COUNT(DISTINCT df_file_name) AS file_count
FROM {{zone_name}}.parquet_demos.all_orders;
-- One year via file_filter
ASSERT ROW_COUNT = 23636
SELECT * FROM {{zone_name}}.parquet_demos.orders_2015;
-- Balanced sample: 100 rows × 14 files
ASSERT ROW_COUNT = 1400
SELECT * FROM {{zone_name}}.parquet_demos.orders_sample;
-- Single-quarter drill-down
ASSERT ROW_COUNT = 5210
SELECT * FROM {{zone_name}}.parquet_demos.orders_q1_2014;
-- Analytics with column pruning + aggregation
ASSERT ROW_COUNT = 10
ASSERT VALUE total_orders = 7481 WHERE "SalespersonPersonID" = 16
SELECT "SalespersonPersonID",
COUNT(*) AS total_orders,
COUNT(DISTINCT "CustomerID") AS unique_customers
FROM {{zone_name}}.parquet_demos.all_orders
GROUP BY "SalespersonPersonID"
ORDER BY total_orders DESC;
-- ============================================================================
-- CLEANUP
-- ============================================================================
DROP EXTERNAL TABLE IF EXISTS {{zone_name}}.parquet_demos.all_orders WITH FILES;
DROP EXTERNAL TABLE IF EXISTS {{zone_name}}.parquet_demos.orders_2015 WITH FILES;
DROP EXTERNAL TABLE IF EXISTS {{zone_name}}.parquet_demos.orders_sample WITH FILES;
DROP EXTERNAL TABLE IF EXISTS {{zone_name}}.parquet_demos.orders_q1_2014 WITH FILES;
DROP SCHEMA IF EXISTS {{zone_name}}.parquet_demos;
DROP ZONE IF EXISTS {{zone_name}};
## When to Use Use this demo when your Parquet landing zone is organised as year/quarter subdirectories (a very common supply-chain, finance, or e-commerce layout) and you need a single external table that sees every file while still being able to drill down by year, by quarter, or by a bounded sample. It is the canonical pattern for multi-file columnar analytics on DeltaForge. ## What You Will Learn 1. How to recursively scan nested directories with `recursive = 'true'` so one LOCATION covers many years of quarterly drops 2. How `row_group_filter = 'true'` activates predicate pushdown using Parquet's row-group min/max statistics 3. How `file_filter` globs pick a year (`Orders_2015*`) or a single quarter (`Orders_2014-03*`) from a recursive scan 4. How `max_rows` per file produces a fast, balanced sample across every file for data profiling 5. How `file_metadata` exposes `df_file_name` / `df_row_number` so every row keeps provenance back to its source Parquet 6. How Parquet's self-describing schema yields correct SQL types with zero schema declaration ## Prerequisites - DeltaForge GUI running with a workspace open - The `data/orders/` directory available at `{{data_path}}/orders` with year subdirectories `2012/`–`2016/` - Permission to CREATE ZONE and CREATE SCHEMA in the target workspace - Familiarity with quoted identifiers (Parquet here uses PascalCase column names like `"OrderID"`, `"SalespersonPersonID"`)