Capture complex XML subtrees (specifications, supplier) as single string columns in JSON or raw XML form instead of flattening them into dozens of scalar columns.
-- === SETUP ===
CREATE ZONE IF NOT EXISTS {{zone_name}} TYPE EXTERNAL
COMMENT 'External tables, demo datasets and file-backed data';
CREATE SCHEMA IF NOT EXISTS {{zone_name}}.xml_demos
COMMENT 'XML-backed external tables';
-- Table 1: subtrees captured as JSON strings
CREATE EXTERNAL TABLE IF NOT EXISTS {{zone_name}}.xml_demos.products_json
USING XML
LOCATION '{{data_path}}'
OPTIONS (
xml_flatten_config = '{
"row_xpath": "//product",
"include_paths": [
"/catalog/product/@id",
"/catalog/product/@status",
"/catalog/product/name",
"/catalog/product/category",
"/catalog/product/price",
"/catalog/product/price/@currency",
"/catalog/product/specifications",
"/catalog/product/supplier",
"/catalog/product/tags"
],
"xml_paths": [
"/catalog/product/specifications",
"/catalog/product/supplier"
],
"column_mappings": {
"/catalog/product/@id": "catalog_product_attr_id",
"/catalog/product/@status": "catalog_product_attr_status",
"/catalog/product/name": "catalog_product_name",
"/catalog/product/category": "catalog_product_category",
"/catalog/product/price": "catalog_product_price",
"/catalog/product/price/@currency": "catalog_product_price_attr_currency",
"/catalog/product/specifications": "catalog_product_specifications",
"/catalog/product/supplier": "catalog_product_supplier",
"/catalog/product/tags": "catalog_product_tags"
},
"include_attributes": true,
"separator": "_",
"max_depth": 10,
"nested_output_format": "json",
"strip_namespace_prefixes": true
}',
file_metadata = '{"columns":["df_file_name","df_row_number"]}'
);
GRANT ADMIN ON TABLE {{zone_name}}.xml_demos.products_json TO USER {{current_user}};
-- Table 2: same subtrees captured as raw XML fragments
CREATE EXTERNAL TABLE IF NOT EXISTS {{zone_name}}.xml_demos.products_xml
USING XML
LOCATION '{{data_path}}'
OPTIONS (
xml_flatten_config = '{
"row_xpath": "//product",
"include_paths": [
"/catalog/product/@id",
"/catalog/product/@status",
"/catalog/product/name",
"/catalog/product/category",
"/catalog/product/price",
"/catalog/product/price/@currency",
"/catalog/product/specifications",
"/catalog/product/supplier",
"/catalog/product/tags"
],
"xml_paths": [
"/catalog/product/specifications",
"/catalog/product/supplier"
],
"column_mappings": {
"/catalog/product/@id": "catalog_product_attr_id",
"/catalog/product/@status": "catalog_product_attr_status",
"/catalog/product/name": "catalog_product_name",
"/catalog/product/category": "catalog_product_category",
"/catalog/product/price": "catalog_product_price",
"/catalog/product/price/@currency": "catalog_product_price_attr_currency",
"/catalog/product/specifications": "catalog_product_specifications",
"/catalog/product/supplier": "catalog_product_supplier",
"/catalog/product/tags": "catalog_product_tags"
},
"include_attributes": true,
"separator": "_",
"max_depth": 10,
"nested_output_format": "xml",
"strip_namespace_prefixes": true
}',
file_metadata = '{"columns":["df_file_name","df_row_number"]}'
);
GRANT ADMIN ON TABLE {{zone_name}}.xml_demos.products_xml TO USER {{current_user}};
-- === QUERIES ===
-- 1. Row counts match across both tables
ASSERT ROW_COUNT = 5
SELECT * FROM {{zone_name}}.xml_demos.products_json;
ASSERT ROW_COUNT = 5
SELECT * FROM {{zone_name}}.xml_demos.products_xml;
-- 2. Top-level fields stay flat even when subtrees are captured whole
ASSERT VALUE catalog_product_name = 'Industrial Sensor Module' WHERE catalog_product_attr_id = 'PRD-001'
ASSERT VALUE catalog_product_category = 'Electronics' WHERE catalog_product_attr_id = 'PRD-001'
ASSERT VALUE catalog_product_price_attr_currency = 'USD' WHERE catalog_product_attr_id = 'PRD-001'
ASSERT VALUE catalog_product_attr_status = 'discontinued' WHERE catalog_product_attr_id = 'PRD-003'
ASSERT VALUE catalog_product_price_attr_currency = 'EUR' WHERE catalog_product_attr_id = 'PRD-004'
SELECT catalog_product_attr_id, catalog_product_name, catalog_product_category,
catalog_product_price, catalog_product_price_attr_currency,
catalog_product_specifications, catalog_product_supplier
FROM {{zone_name}}.xml_demos.products_json
ORDER BY catalog_product_attr_id;
-- 3. JSON format detection: captured subtree starts with '{'
ASSERT VALUE specs_json_count = 5
SELECT COUNT(*) FILTER (WHERE catalog_product_specifications LIKE '{%') AS specs_json_count
FROM {{zone_name}}.xml_demos.products_json;
-- 4. XML format detection: captured subtree starts with '<'
ASSERT VALUE specs_xml_count = 5
SELECT COUNT(*) FILTER (WHERE catalog_product_specifications LIKE '<%') AS specs_xml_count
FROM {{zone_name}}.xml_demos.products_xml;
-- 5. Supplier company names present in both captures
ASSERT VALUE supplier_json_count = 5
SELECT COUNT(*) FILTER (
WHERE catalog_product_supplier LIKE '%TechParts%'
OR catalog_product_supplier LIKE '%NetCore%'
OR catalog_product_supplier LIKE '%EuroPower%'
) AS supplier_json_count
FROM {{zone_name}}.xml_demos.products_json;
-- 6. Side-by-side JSON vs XML capture for a single product
ASSERT ROW_COUNT = 1
SELECT j.catalog_product_attr_id,
j.catalog_product_name,
j.catalog_product_specifications AS specs_as_json,
x.catalog_product_specifications AS specs_as_xml
FROM {{zone_name}}.xml_demos.products_json j
JOIN {{zone_name}}.xml_demos.products_xml x ON j.catalog_product_attr_id = x.catalog_product_attr_id
WHERE j.catalog_product_attr_id = 'PRD-002';
-- === CLEANUP ===
DROP EXTERNAL TABLE IF EXISTS {{zone_name}}.xml_demos.products_json WITH FILES;
DROP EXTERNAL TABLE IF EXISTS {{zone_name}}.xml_demos.products_xml WITH FILES;
DROP SCHEMA IF EXISTS {{zone_name}}.xml_demos;
DROP ZONE IF EXISTS {{zone_name}};
## When to Use Reach for `xml_paths` whenever a subtree is too deep, too variable, or too domain-specific to flatten cleanly, product specifications, legal clauses, supplier blocks, HL7 OBX segments, or anything you want to hand off to a downstream JSON parser or XPath evaluator. The top-level fields you care about for joins and filters stay flat and typed, while the messy inner structure is preserved verbatim. ## What You Will Learn 1. How `xml_paths` designates subtrees to capture as strings rather than flatten, the same flatten config handles top-level fields normally. 2. How `nested_output_format: "json"` serialises the captured subtree as a JSON object with underscore-joined keys, ready for `json_extract` or `LIKE '%...%'` probes. 3. How `nested_output_format: "xml"` emits a raw XML fragment string that preserves element names, attributes, and nesting, useful when you need to round-trip or feed an XPath library. 4. How multiple subtree captures coexist in one row (both `specifications` and `supplier` are captured per product). 5. How `include_paths` plus `xml_paths` plus `column_mappings` compose: captured paths are renamed just like any other column. 6. How a single `LOCATION` backs two tables, one JSON-serialised, one XML-serialised, so downstream consumers can pick their preferred format. 7. How to ASSERT on the captured string with cheap `LIKE '{%'` / `LIKE '<%'` checks that detect regressions in the serialiser without needing a JSON/XML parser in the assertion engine. ## Prerequisites An external zone, product catalog XML files with `<catalog><product id="..."><specifications>...</specifications><supplier>...</supplier></product></catalog>` shape (the demo ships two), and `ADMIN` on the target schema. You should already be comfortable with basic XML flattening before you pick which subtrees to capture whole.