Read 2,500 sensor readings from 5 building-floor Avro files with schema evolution (v1→v2), mixed codecs, file_filter, max_rows, and file_metadata-driven row accounting.
-- ==========================================================================
-- SECTION 1: Zone & Schema
-- ==========================================================================
CREATE ZONE IF NOT EXISTS {{zone_name}} TYPE EXTERNAL
COMMENT 'External tables, demo datasets and file-backed data';
CREATE SCHEMA IF NOT EXISTS {{zone_name}}.avro_demos
COMMENT 'Avro-backed external tables';
-- ==========================================================================
-- SECTION 2: Three external tables over 5 per-floor Avro files
-- ==========================================================================
-- All 5 files. Union schema = v1 ∪ v2; v1 rows get NULL for v2-only cols.
CREATE EXTERNAL TABLE IF NOT EXISTS {{zone_name}}.avro_demos.all_readings
USING AVRO
LOCATION '{{data_path}}'
OPTIONS (
file_metadata = '{"columns":["df_file_name","df_row_number"]}'
);
GRANT ADMIN ON TABLE {{zone_name}}.avro_demos.all_readings TO USER {{current_user}};
-- Single-file filter to surface the full v2 schema (battery_pct, firmware_version).
CREATE EXTERNAL TABLE IF NOT EXISTS {{zone_name}}.avro_demos.floor4_only
USING AVRO
LOCATION '{{data_path}}'
OPTIONS (
file_filter = 'floor4*',
file_metadata = '{"columns":["df_file_name","df_row_number"]}'
);
GRANT ADMIN ON TABLE {{zone_name}}.avro_demos.floor4_only TO USER {{current_user}};
-- Per-file sampling: 50 rows x 5 files = 250.
CREATE EXTERNAL TABLE IF NOT EXISTS {{zone_name}}.avro_demos.readings_sample
USING AVRO
LOCATION '{{data_path}}'
OPTIONS (
max_rows = '50',
file_metadata = '{"columns":["df_file_name","df_row_number"]}'
);
GRANT ADMIN ON TABLE {{zone_name}}.avro_demos.readings_sample TO USER {{current_user}};
-- ==========================================================================
-- SECTION 3: Representative queries with ASSERT guards
-- ==========================================================================
-- 3a. Full scan (2,500 rows).
ASSERT ROW_COUNT = 2500
SELECT * FROM {{zone_name}}.avro_demos.all_readings;
-- 3b. Per-file row accounting via df_file_name.
ASSERT ROW_COUNT = 5
ASSERT VALUE row_count = 500 WHERE df_file_name LIKE '%floor1%'
ASSERT VALUE row_count = 500 WHERE df_file_name LIKE '%floor4%'
SELECT df_file_name, COUNT(*) AS row_count
FROM {{zone_name}}.avro_demos.all_readings
GROUP BY df_file_name
ORDER BY df_file_name;
-- 3c. V2-only columns via file_filter.
ASSERT VALUE battery_non_null = 500
SELECT COUNT(battery_pct) AS battery_non_null
FROM {{zone_name}}.avro_demos.floor4_only;
ASSERT VALUE firmware_non_null = 500
SELECT COUNT(firmware_version) AS firmware_non_null
FROM {{zone_name}}.avro_demos.floor4_only;
-- 3d. Per-file sampling with max_rows (50 x 5 = 250).
ASSERT ROW_COUNT = 250
SELECT * FROM {{zone_name}}.avro_demos.readings_sample;
-- 3e. Floor/zone analytics (25 combinations = 5 floors x 5 zones).
ASSERT ROW_COUNT = 25
SELECT floor, zone,
ROUND(AVG(CAST(temperature_c AS DOUBLE)), 1) AS avg_temp_c,
ROUND(AVG(CAST(humidity_pct AS DOUBLE)), 1) AS avg_humidity,
ROUND(AVG(CAST(co2_ppm AS DOUBLE)), 0) AS avg_co2
FROM {{zone_name}}.avro_demos.all_readings
GROUP BY floor, zone
ORDER BY floor, zone;
-- 3f. Occupancy rate by floor.
ASSERT ROW_COUNT = 5
ASSERT VALUE occupied_readings = 364 WHERE floor = 1
ASSERT VALUE occupancy_pct = 72.8 WHERE floor = 1
SELECT floor,
COUNT(*) AS total_readings,
SUM(CASE WHEN CAST(occupancy AS BOOLEAN) THEN 1 ELSE 0 END) AS occupied_readings,
ROUND(SUM(CASE WHEN CAST(occupancy AS BOOLEAN) THEN 1 ELSE 0 END)
* 100.0 / COUNT(*), 1) AS occupancy_pct
FROM {{zone_name}}.avro_demos.all_readings
GROUP BY floor
ORDER BY floor;
-- ==========================================================================
-- SECTION 4: Cleanup
-- ==========================================================================
DROP EXTERNAL TABLE IF EXISTS {{zone_name}}.avro_demos.all_readings WITH FILES;
DROP EXTERNAL TABLE IF EXISTS {{zone_name}}.avro_demos.floor4_only WITH FILES;
DROP EXTERNAL TABLE IF EXISTS {{zone_name}}.avro_demos.readings_sample WITH FILES;
DROP SCHEMA IF EXISTS {{zone_name}}.avro_demos;
## When to Use Choose this demo when you want a realistic, larger-scale (2,500 rows) reference for Avro in streaming/IoT contexts: per-device or per-partition files, one or two schema upgrades rolling out per fleet, and cheap per-block deflate compression. It is also the best example in the Avro suite for exercising `file_metadata`, every row is traceable back to its source file and row number, which is essential when auditing telemetry pipelines. ## What You Will Learn 1. How Avro's self-describing file headers eliminate the need for a schema-on-read declaration: `CREATE EXTERNAL TABLE ... USING AVRO` is all you need. 2. How schema evolution across 5 files (v1 on floors 1–3, v2 on floors 4–5 with extra `battery_pct`, `firmware_version`) produces a single union schema with NULL-filling for older rows. 3. How to use `file_filter = 'floor4*'` to promote a v2 file to the top of the schema, giving you access to all v2 columns with no NULLs. 4. How `df_file_name` and `df_row_number` system columns let you write aggregate checks like `SELECT df_file_name, COUNT(*) ... GROUP BY df_file_name` to verify per-file ingestion. 5. How to pattern multi-check verification using `UNION ALL` and `CASE WHEN ... THEN 'PASS' ELSE 'FAIL' END`, a reusable smoke-test template for any multi-file ingestion. ## Prerequisites - DeltaForge with Avro reader enabled (default). - Read access to the demo `data/` directory containing `floor1_sensors.avro` … `floor5_sensors.avro`. - Permission to create a zone and schema, or an existing `{{zone_name}}` you own.