Union 5 yearly XML catalog exports (2000-2004) into one external table, with added/dropped elements becoming NULL in files that lack them.
-- === SETUP ===
CREATE ZONE IF NOT EXISTS {{zone_name}} TYPE EXTERNAL
COMMENT 'External tables, demo datasets and file-backed data';
CREATE SCHEMA IF NOT EXISTS {{zone_name}}.xml_demos
COMMENT 'XML-backed external tables';
-- One external table reads all 5 files. The xml_flatten_config declares the
-- union of every path across every vintage; missing paths produce NULL.
CREATE EXTERNAL TABLE IF NOT EXISTS {{zone_name}}.xml_demos.books_evolved
USING XML
LOCATION '{{data_path}}'
OPTIONS (
xml_flatten_config = '{
"row_xpath": "//book",
"include_paths": [
"/catalog/book/@id",
"/catalog/book/@format",
"/catalog/book/author",
"/catalog/book/title",
"/catalog/book/genre",
"/catalog/book/price",
"/catalog/book/publish_date",
"/catalog/book/description",
"/catalog/book/isbn",
"/catalog/book/language",
"/catalog/book/publisher",
"/catalog/book/rating",
"/catalog/book/edition",
"/catalog/book/pages",
"/catalog/book/series"
],
"column_mappings": {
"/catalog/book/@id": "catalog_book_attr_id",
"/catalog/book/@format": "catalog_book_attr_format",
"/catalog/book/author": "catalog_book_author",
"/catalog/book/title": "catalog_book_title",
"/catalog/book/genre": "catalog_book_genre",
"/catalog/book/price": "catalog_book_price",
"/catalog/book/publish_date": "catalog_book_publish_date",
"/catalog/book/description": "catalog_book_description",
"/catalog/book/isbn": "catalog_book_isbn",
"/catalog/book/language": "catalog_book_language",
"/catalog/book/publisher": "catalog_book_publisher",
"/catalog/book/rating": "catalog_book_rating",
"/catalog/book/edition": "catalog_book_edition",
"/catalog/book/pages": "catalog_book_pages",
"/catalog/book/series": "catalog_book_series"
},
"include_attributes": true,
"separator": "_",
"max_depth": 10,
"strip_namespace_prefixes": true
}',
file_metadata = '{"columns":["df_file_name","df_file_modified","df_dataset","df_row_number"]}'
);
GRANT ADMIN ON TABLE {{zone_name}}.xml_demos.books_evolved TO USER {{current_user}};
-- === QUERIES ===
-- 1. Total row count across every vintage
ASSERT ROW_COUNT = 15
SELECT * FROM {{zone_name}}.xml_demos.books_evolved;
-- 2. NULL counts prove which waves introduced each column
ASSERT VALUE isbn_null_count = 3 -- added in file 2 (wave 1 NULL)
ASSERT VALUE publisher_null_count = 6 -- added in file 3 (waves 1-2 NULL)
ASSERT VALUE description_null_count = 6 -- dropped in file 4 (waves 4-5 NULL)
ASSERT VALUE edition_null_count = 9 -- added in file 4 (waves 1-3 NULL)
ASSERT VALUE series_null_count = 12 -- added in file 5 (waves 1-4 NULL)
ASSERT VALUE attr_format_null_count = 12 -- attribute added in file 5
SELECT
COUNT(*) FILTER (WHERE catalog_book_isbn IS NULL) AS isbn_null_count,
COUNT(*) FILTER (WHERE catalog_book_publisher IS NULL) AS publisher_null_count,
COUNT(*) FILTER (WHERE catalog_book_description IS NULL) AS description_null_count,
COUNT(*) FILTER (WHERE catalog_book_edition IS NULL) AS edition_null_count,
COUNT(*) FILTER (WHERE catalog_book_series IS NULL) AS series_null_count,
COUNT(*) FILTER (WHERE catalog_book_attr_format IS NULL) AS attr_format_null_count
FROM {{zone_name}}.xml_demos.books_evolved;
-- 3. Spot check individual vintages
ASSERT VALUE catalog_book_author = 'Gambardella, Matthew' WHERE catalog_book_attr_id = 'bk101'
ASSERT VALUE catalog_book_attr_format = 'hardcover' WHERE catalog_book_attr_id = 'bk113'
ASSERT VALUE catalog_book_series = 'Maeve Saga' WHERE catalog_book_attr_id = 'bk113'
SELECT catalog_book_attr_id, catalog_book_title, catalog_book_attr_format, catalog_book_series
FROM {{zone_name}}.xml_demos.books_evolved
WHERE catalog_book_attr_id IN ('bk101','bk108','bk113')
ORDER BY catalog_book_attr_id;
-- 4. Genre distribution across the full corpus
ASSERT VALUE book_count = 5 WHERE catalog_book_genre = 'Fantasy'
ASSERT VALUE book_count = 4 WHERE catalog_book_genre = 'Computer'
SELECT catalog_book_genre, COUNT(*) AS book_count
FROM {{zone_name}}.xml_demos.books_evolved
GROUP BY catalog_book_genre
ORDER BY book_count DESC;
-- === CLEANUP ===
DROP EXTERNAL TABLE IF EXISTS {{zone_name}}.xml_demos.books_evolved WITH FILES;
DROP SCHEMA IF EXISTS {{zone_name}}.xml_demos;
DROP ZONE IF EXISTS {{zone_name}};
## When to Use Reach for this pattern whenever an XML feed evolves over time, new elements appear, old ones are retired, and new attributes are introduced, and you need a single queryable table that spans every vintage. The engine saves a pre-declared `xml_flatten_config` to the catalog and uses it on every read, so the union schema is stable even if new files arrive with extra or missing elements. ## What You Will Learn 1. How to declare a union schema up-front via `include_paths` and `column_mappings` so schema discovery does not have to sniff every file. 2. How `//book` as `row_xpath` turns each `<book>` element into a row regardless of parent structure. 3. Why attributes are extracted as dedicated columns with the `attr_` segment in their mapped names (for example `/catalog/book/@id` becomes `catalog_book_attr_id`). 4. How DeltaForge fills missing elements with SQL `NULL` instead of failing, files 1-3 have no `isbn`, yet the unified table has an `isbn` column that is simply `NULL` for those rows. 5. How to verify schema evolution with `ASSERT VALUE ... IS NULL` and `COUNT(*) FILTER (WHERE col IS NULL)` checks that pin down exactly which rows lack which columns. 6. How `strip_namespace_prefixes: true` keeps column names clean even if later files add namespaced variants. 7. How to use `df_file_name` file-metadata to trace a row back to its source XML. ## Prerequisites An external zone you can write to, five XML files of the form `<catalog><book id="...">...</book></catalog>` (the demo provides `01_catalog_2000.xml` through `05_catalog_2004.xml`), and `ADMIN` on the schema where the table will be created. The demo substitutes `{{zone_name}}`, `{{data_path}}`, and `{{current_user}}` at runtime.