Join two ORC-backed external tables (policies, claims) and prove INNER/LEFT/anti-joins, EXISTS, and IN-subquery semantics across ORC data.
-- ================================================================
-- SETUP
-- ================================================================
CREATE ZONE IF NOT EXISTS external TYPE EXTERNAL
COMMENT 'External tables, demo datasets and file-backed data';
CREATE SCHEMA IF NOT EXISTS external.orc_insurance
COMMENT 'ORC-backed insurance tables';
CREATE EXTERNAL TABLE IF NOT EXISTS external.orc_insurance.policies
USING ORC
LOCATION '/data/insurance/policies.orc';
CREATE EXTERNAL TABLE IF NOT EXISTS external.orc_insurance.claims
USING ORC
LOCATION '/data/insurance/claims.orc';
-- ================================================================
-- QUERIES
-- ================================================================
-- INNER JOIN, matched claims (180 of 200).
SELECT c.claim_id, c.policy_id, c.claim_amount, c.status,
p.holder_name, p.policy_type, p.region
FROM external.orc_insurance.claims c
INNER JOIN external.orc_insurance.policies p
ON c.policy_id = p.policy_id;
-- LEFT JOIN + count orphans (claims whose policy_id is unknown).
SELECT COUNT(*) FILTER (WHERE p.policy_id IS NULL) AS orphan_count
FROM external.orc_insurance.claims c
LEFT JOIN external.orc_insurance.policies p
ON c.policy_id = p.policy_id;
-- Anti-join, policies with zero claims.
SELECT p.policy_id, p.holder_name, p.policy_type, p.annual_premium
FROM external.orc_insurance.policies p
LEFT JOIN external.orc_insurance.claims c
ON p.policy_id = c.policy_id
WHERE c.claim_id IS NULL
ORDER BY p.policy_id;
-- Aggregation after JOIN, claims grouped by policy type.
SELECT p.policy_type,
COUNT(*) AS claim_count,
ROUND(AVG(c.claim_amount), 2) AS avg_claim
FROM external.orc_insurance.claims c
INNER JOIN external.orc_insurance.policies p
ON c.policy_id = p.policy_id
GROUP BY p.policy_type
ORDER BY claim_count DESC;
-- EXISTS, policies that have at least one Denied claim.
SELECT p.policy_id, p.holder_name, p.policy_type
FROM external.orc_insurance.policies p
WHERE EXISTS (
SELECT 1 FROM external.orc_insurance.claims c
WHERE c.policy_id = p.policy_id AND c.status = 'Denied'
)
ORDER BY p.policy_id;
-- IN subquery, claims filed against Auto policies.
SELECT c.claim_id, c.policy_id, c.claim_amount, c.status
FROM external.orc_insurance.claims c
WHERE c.policy_id IN (
SELECT p.policy_id FROM external.orc_insurance.policies p
WHERE p.policy_type = 'Auto'
)
ORDER BY c.claim_id;
-- ================================================================
-- CLEANUP
-- ================================================================
DROP EXTERNAL TABLE IF EXISTS external.orc_insurance.claims WITH FILES;
DROP EXTERNAL TABLE IF EXISTS external.orc_insurance.policies WITH FILES;
DROP SCHEMA IF EXISTS external.orc_insurance;
DROP ZONE IF EXISTS external;
## When to Use Use this pattern when your ORC landing zone contains two or more related entities, policies + claims, customers + orders, devices + events, and you need to answer referential-integrity and cross-table analytical questions directly against the files, without first loading into Delta. It is the canonical way to verify that DeltaForge's join planner handles ORC inputs identically to Parquet. ## What You Will Learn 1. Creating two independent external tables, each pointing at a different ORC file, in the same schema. 2. INNER JOIN for matched rows only (180 of 200 claims reference a valid policy). 3. LEFT JOIN plus `WHERE right.pk IS NULL` as the SQL anti-join idiom, here, policies with zero claims (4 rows). 4. EXISTS vs IN: when to use each. EXISTS short-circuits per outer row and works with correlated subqueries; IN materialises the inner set and is planner-friendly for uncorrelated filters. 5. Counting orphan rows with `COUNT(*) FILTER (WHERE p.policy_id IS NULL)` after a LEFT JOIN, one scan, one number. ## Prerequisites - A zone of TYPE EXTERNAL. - Two ORC files: `policies.orc` (80 rows) and `claims.orc` (200 rows, 20 orphaned). - Both files live in the same data_path; the external tables reference them individually by name.