Exercise ORC's NULL bitmap on a heavily-nullable patient dataset using COALESCE, NULLIF, CASE, and string predicates.
-- ================================================================
-- SETUP
-- ================================================================
CREATE ZONE IF NOT EXISTS external TYPE EXTERNAL
COMMENT 'External tables, demo datasets and file-backed data';
CREATE SCHEMA IF NOT EXISTS external.orc_trials
COMMENT 'ORC-backed clinical trial tables';
CREATE EXTERNAL TABLE IF NOT EXISTS external.orc_trials.patients
USING ORC
LOCATION '/data/clinical-trials/patients.orc';
-- ================================================================
-- QUERIES
-- ================================================================
-- NULL inventory across every nullable column in one scan.
SELECT COUNT(*) FILTER (WHERE followup_score IS NULL) AS null_followup,
COUNT(*) FILTER (WHERE adverse_event IS NULL) AS null_adverse,
COUNT(*) FILTER (WHERE notes IS NULL) AS null_notes,
COUNT(*) FILTER (WHERE bmi IS NULL) AS null_bmi,
COUNT(*) FILTER (WHERE gender IS NULL) AS null_gender
FROM external.orc_trials.patients;
-- COALESCE, fall back to baseline_score when followup is NULL (dropouts).
SELECT patient_id,
baseline_score,
followup_score,
COALESCE(followup_score, baseline_score) AS effective_score,
CASE WHEN followup_score IS NULL THEN 'Dropout' ELSE 'Completed' END AS status
FROM external.orc_trials.patients
ORDER BY patient_id;
-- CASE classification: Improved / Worsened / Dropout.
SELECT CASE
WHEN followup_score IS NULL THEN 'Dropout'
WHEN followup_score < baseline_score THEN 'Improved'
ELSE 'Worsened'
END AS outcome,
COUNT(*) AS patient_count
FROM external.orc_trials.patients
GROUP BY 1
ORDER BY patient_count DESC;
-- NULLIF, treat empty strings as missing content.
SELECT COUNT(*) FILTER (WHERE NULLIF(notes, '') IS NULL) AS effectively_empty,
COUNT(*) FILTER (WHERE NULLIF(notes, '') IS NOT NULL) AS has_content
FROM external.orc_trials.patients;
-- BMI buckets with NULL-safe label.
SELECT CASE
WHEN bmi IS NULL THEN 'Unknown'
WHEN bmi < 25.0 THEN 'Normal'
WHEN bmi < 30.0 THEN 'Overweight'
ELSE 'Obese'
END AS bmi_category,
COUNT(*) AS patient_count
FROM external.orc_trials.patients
GROUP BY 1
ORDER BY patient_count DESC;
-- LIKE filter on a nullable column: NULL rows silently drop out.
SELECT patient_id, trial_site, treatment_arm, notes
FROM external.orc_trials.patients
WHERE notes LIKE '%responded%'
ORDER BY patient_id;
-- ================================================================
-- CLEANUP
-- ================================================================
DROP EXTERNAL TABLE IF EXISTS external.orc_trials.patients WITH FILES;
DROP SCHEMA IF EXISTS external.orc_trials;
DROP ZONE IF EXISTS external;
## When to Use Use this pattern when your source ORC files carry sparse, optional fields, typical of clinical, survey, or CRM data, and you need dependable NULL semantics: fallback values, dropout classification, empty-string normalisation, and bucketed CASE output. It is the canonical way to prove that ORC's presence-bitmap encoding round-trips through DeltaForge without turning NULLs into empty strings or zeroes. ## What You Will Learn 1. How to enumerate NULL counts per column with COUNT(*) FILTER (WHERE col IS NULL), a DeltaForge idiom that replaces correlated scalar subqueries. 2. COALESCE for dropout fallback (effective_score = followup OR baseline) so a single scan yields an analysis-ready column. 3. NULLIF for treating empty strings as NULL, a frequent source of silent bugs when string data lands next to true NULLs in the same column. 4. Three-branch CASE for outcome classification (Improved / Worsened / Dropout) and NULL-safe bucket labels (BMI Unknown/Normal/Overweight/Obese). 5. LIKE-based filtering on a nullable text column: LIKE returns UNKNOWN for NULL, which behaves as false in WHERE, so NULLs are silently excluded. ## Prerequisites - A zone of TYPE EXTERNAL. - One ORC file at `{data_path}/patients.orc` with 150 rows and NULLs in followup_score, adverse_event, notes, bmi, gender. - No partitioning required, the demo exercises scalar NULL semantics, not pruning.