Read 90 insurance claims across three Avro files where v2 adds two new fields, validating automatic NULL-filling on v1 rows, file_filter, max_rows, and file_metadata.
-- ==========================================================================
-- SECTION 1: Zone & Schema
-- ==========================================================================
CREATE ZONE IF NOT EXISTS {{zone_name}} TYPE EXTERNAL
COMMENT 'External tables, demo datasets and file-backed data';
CREATE SCHEMA IF NOT EXISTS {{zone_name}}.avro_insurance
COMMENT 'Insurance claims Avro-backed external tables';
-- ==========================================================================
-- SECTION 2: Three external tables exercising evolution, filter, sampling
-- ==========================================================================
-- All 3 files merged: v1 rows get NULL for adjuster_name / settlement_date.
CREATE EXTERNAL TABLE IF NOT EXISTS {{zone_name}}.avro_insurance.all_claims
USING AVRO
LOCATION '{{data_path}}'
OPTIONS (
file_metadata = '{"columns":["df_file_name","df_row_number"]}'
);
GRANT ADMIN ON TABLE {{zone_name}}.avro_insurance.all_claims TO USER {{current_user}};
-- Auto claims only (v1 + v2 of the auto line = 60 rows).
CREATE EXTERNAL TABLE IF NOT EXISTS {{zone_name}}.avro_insurance.auto_claims_only
USING AVRO
LOCATION '{{data_path}}'
OPTIONS (
file_filter = '*auto*',
file_metadata = '{"columns":["df_file_name"]}'
);
GRANT ADMIN ON TABLE {{zone_name}}.avro_insurance.auto_claims_only TO USER {{current_user}};
-- Profiling sample: 15 rows per file, 3 files = 45 rows.
CREATE EXTERNAL TABLE IF NOT EXISTS {{zone_name}}.avro_insurance.sampled_claims
USING AVRO
LOCATION '{{data_path}}'
OPTIONS (
max_rows = '15',
file_metadata = '{"columns":["df_file_name"]}'
);
GRANT ADMIN ON TABLE {{zone_name}}.avro_insurance.sampled_claims TO USER {{current_user}};
-- ==========================================================================
-- SECTION 3: Representative queries with ASSERT guards
-- ==========================================================================
-- 3a. Full merged scan (90 rows across v1 + v2).
ASSERT ROW_COUNT = 90
SELECT * FROM {{zone_name}}.avro_insurance.all_claims;
-- 3b. Evolution: 60 v1 rows have NULL adjuster_name;
-- settlement_date is also NULL on Pending / Under Review v2 rows.
ASSERT VALUE null_adjuster = 60
ASSERT VALUE null_settlement = 74
SELECT COUNT(*) - COUNT(adjuster_name) AS null_adjuster,
COUNT(*) - COUNT(settlement_date) AS null_settlement
FROM {{zone_name}}.avro_insurance.all_claims;
-- 3c. file_filter picks up BOTH auto files (v1 + v2).
ASSERT ROW_COUNT = 60
SELECT * FROM {{zone_name}}.avro_insurance.auto_claims_only;
-- 3d. Claim-type breakdown (exact counts, safe on categorical strings).
ASSERT ROW_COUNT = 8
ASSERT VALUE claim_count = 16 WHERE claim_type = 'Collision'
SELECT claim_type,
COUNT(*) AS claim_count,
ROUND(AVG(amount_claimed),2) AS avg_claimed
FROM {{zone_name}}.avro_insurance.all_claims
GROUP BY claim_type
ORDER BY claim_type;
-- 3e. Status rollup, float SUMs use a tolerance band.
ASSERT VALUE claim_count = 24 WHERE status = 'Approved'
ASSERT WARNING VALUE sum_approved BETWEEN 502164.47 AND 502164.49
WHERE status = 'Approved'
ASSERT VALUE sum_approved = 0.0 WHERE status = 'Denied'
SELECT status,
COUNT(*) AS claim_count,
ROUND(SUM(amount_approved),2) AS sum_approved
FROM {{zone_name}}.avro_insurance.all_claims
GROUP BY status
ORDER BY status;
-- 3f. Profiling sample (15 per file x 3 files = 45).
ASSERT ROW_COUNT = 45
SELECT * FROM {{zone_name}}.avro_insurance.sampled_claims;
-- ==========================================================================
-- SECTION 4: Cleanup
-- ==========================================================================
DROP EXTERNAL TABLE IF EXISTS {{zone_name}}.avro_insurance.all_claims WITH FILES;
DROP EXTERNAL TABLE IF EXISTS {{zone_name}}.avro_insurance.auto_claims_only WITH FILES;
DROP EXTERNAL TABLE IF EXISTS {{zone_name}}.avro_insurance.sampled_claims WITH FILES;
DROP SCHEMA IF EXISTS {{zone_name}}.avro_insurance;
## When to Use Pick this demo when you need to prove that DeltaForge's Avro reader handles **producer-side schema evolution** correctly, i.e. when old files (v1) and new files (v2) coexist in the same directory, and queries must see a single unified schema with `NULL` filled in for missing fields. It is also a good reference for building analytics pipelines that mix approved (`double` amount), pending (no settlement date), and denied claims with mixed deflate/null compression codecs. ## What You Will Learn 1. How Avro's schema resolution rules surface when v1 (10 fields) and v2 (12 fields, adding `adjuster_name`, `settlement_date`) are read as one table, v1 rows get `NULL` for v2-only columns automatically. 2. How to reason about `COUNT(col)` vs `COUNT(*)` when some rows are NULL-filled: `COUNT(*) - COUNT(adjuster_name)` is the correct way to count missing values. 3. How `file_filter = '*auto*'` captures BOTH `claims_auto_v1.avro` and `claims_auto_v2.avro`, giving you a single schema-evolved view of one product line. 4. Why floating-point aggregates over Avro `double` fields should be guarded with `ASSERT WARNING VALUE ... BETWEEN ...` ranges, not exact equality, cross-engine and cross-order summation is not bit-exact. 5. How `max_rows = '15'` plus three files yields 45 rows, which is enough to profile data quality without a full scan. ## Prerequisites - DeltaForge with Avro reader enabled (default). - Read access to the demo `data/` directory containing `claims_auto_v1.avro`, `claims_home_v1.avro`, `claims_auto_v2.avro`. - Permission to create a zone and schema, or an existing `{{zone_name}}` you own.