XML Subtree Capture, xml_paths & nested_output_format

Capture complex XML subtrees (specifications, supplier) as single string columns in JSON or raw XML form instead of flattening them into dozens of scalar columns.

Category: xml

Syntax

-- === SETUP ===
CREATE ZONE IF NOT EXISTS {{zone_name}} TYPE EXTERNAL
    COMMENT 'External tables, demo datasets and file-backed data';
CREATE SCHEMA IF NOT EXISTS {{zone_name}}.xml_demos
    COMMENT 'XML-backed external tables';

-- Table 1: subtrees captured as JSON strings
CREATE EXTERNAL TABLE IF NOT EXISTS {{zone_name}}.xml_demos.products_json
USING XML
LOCATION '{{data_path}}'
OPTIONS (
    xml_flatten_config = '{
        "row_xpath": "//product",
        "include_paths": [
            "/catalog/product/@id",
            "/catalog/product/@status",
            "/catalog/product/name",
            "/catalog/product/category",
            "/catalog/product/price",
            "/catalog/product/price/@currency",
            "/catalog/product/specifications",
            "/catalog/product/supplier",
            "/catalog/product/tags"
        ],
        "xml_paths": [
            "/catalog/product/specifications",
            "/catalog/product/supplier"
        ],
        "column_mappings": {
            "/catalog/product/@id":                 "catalog_product_attr_id",
            "/catalog/product/@status":             "catalog_product_attr_status",
            "/catalog/product/name":                "catalog_product_name",
            "/catalog/product/category":            "catalog_product_category",
            "/catalog/product/price":               "catalog_product_price",
            "/catalog/product/price/@currency":     "catalog_product_price_attr_currency",
            "/catalog/product/specifications":      "catalog_product_specifications",
            "/catalog/product/supplier":            "catalog_product_supplier",
            "/catalog/product/tags":                "catalog_product_tags"
        },
        "include_attributes": true,
        "separator": "_",
        "max_depth": 10,
        "nested_output_format": "json",
        "strip_namespace_prefixes": true
    }',
    file_metadata = '{"columns":["df_file_name","df_row_number"]}'
);
GRANT ADMIN ON TABLE {{zone_name}}.xml_demos.products_json TO USER {{current_user}};

-- Table 2: same subtrees captured as raw XML fragments
CREATE EXTERNAL TABLE IF NOT EXISTS {{zone_name}}.xml_demos.products_xml
USING XML
LOCATION '{{data_path}}'
OPTIONS (
    xml_flatten_config = '{
        "row_xpath": "//product",
        "include_paths": [
            "/catalog/product/@id",
            "/catalog/product/@status",
            "/catalog/product/name",
            "/catalog/product/category",
            "/catalog/product/price",
            "/catalog/product/price/@currency",
            "/catalog/product/specifications",
            "/catalog/product/supplier",
            "/catalog/product/tags"
        ],
        "xml_paths": [
            "/catalog/product/specifications",
            "/catalog/product/supplier"
        ],
        "column_mappings": {
            "/catalog/product/@id":                 "catalog_product_attr_id",
            "/catalog/product/@status":             "catalog_product_attr_status",
            "/catalog/product/name":                "catalog_product_name",
            "/catalog/product/category":            "catalog_product_category",
            "/catalog/product/price":               "catalog_product_price",
            "/catalog/product/price/@currency":     "catalog_product_price_attr_currency",
            "/catalog/product/specifications":      "catalog_product_specifications",
            "/catalog/product/supplier":            "catalog_product_supplier",
            "/catalog/product/tags":                "catalog_product_tags"
        },
        "include_attributes": true,
        "separator": "_",
        "max_depth": 10,
        "nested_output_format": "xml",
        "strip_namespace_prefixes": true
    }',
    file_metadata = '{"columns":["df_file_name","df_row_number"]}'
);
GRANT ADMIN ON TABLE {{zone_name}}.xml_demos.products_xml TO USER {{current_user}};

-- === QUERIES ===
-- 1. Row counts match across both tables
ASSERT ROW_COUNT = 5
SELECT * FROM {{zone_name}}.xml_demos.products_json;

ASSERT ROW_COUNT = 5
SELECT * FROM {{zone_name}}.xml_demos.products_xml;

-- 2. Top-level fields stay flat even when subtrees are captured whole
ASSERT VALUE catalog_product_name                  = 'Industrial Sensor Module' WHERE catalog_product_attr_id = 'PRD-001'
ASSERT VALUE catalog_product_category              = 'Electronics'              WHERE catalog_product_attr_id = 'PRD-001'
ASSERT VALUE catalog_product_price_attr_currency   = 'USD'                      WHERE catalog_product_attr_id = 'PRD-001'
ASSERT VALUE catalog_product_attr_status           = 'discontinued'             WHERE catalog_product_attr_id = 'PRD-003'
ASSERT VALUE catalog_product_price_attr_currency   = 'EUR'                      WHERE catalog_product_attr_id = 'PRD-004'
SELECT catalog_product_attr_id, catalog_product_name, catalog_product_category,
       catalog_product_price, catalog_product_price_attr_currency,
       catalog_product_specifications, catalog_product_supplier
FROM {{zone_name}}.xml_demos.products_json
ORDER BY catalog_product_attr_id;

-- 3. JSON format detection: captured subtree starts with '{'
ASSERT VALUE specs_json_count = 5
SELECT COUNT(*) FILTER (WHERE catalog_product_specifications LIKE '{%') AS specs_json_count
FROM {{zone_name}}.xml_demos.products_json;

-- 4. XML format detection: captured subtree starts with '<'
ASSERT VALUE specs_xml_count = 5
SELECT COUNT(*) FILTER (WHERE catalog_product_specifications LIKE '<%') AS specs_xml_count
FROM {{zone_name}}.xml_demos.products_xml;

-- 5. Supplier company names present in both captures
ASSERT VALUE supplier_json_count = 5
SELECT COUNT(*) FILTER (
    WHERE catalog_product_supplier LIKE '%TechParts%'
       OR catalog_product_supplier LIKE '%NetCore%'
       OR catalog_product_supplier LIKE '%EuroPower%'
) AS supplier_json_count
FROM {{zone_name}}.xml_demos.products_json;

-- 6. Side-by-side JSON vs XML capture for a single product
ASSERT ROW_COUNT = 1
SELECT j.catalog_product_attr_id,
       j.catalog_product_name,
       j.catalog_product_specifications AS specs_as_json,
       x.catalog_product_specifications AS specs_as_xml
FROM {{zone_name}}.xml_demos.products_json j
JOIN {{zone_name}}.xml_demos.products_xml  x ON j.catalog_product_attr_id = x.catalog_product_attr_id
WHERE j.catalog_product_attr_id = 'PRD-002';

-- === CLEANUP ===
DROP EXTERNAL TABLE IF EXISTS {{zone_name}}.xml_demos.products_json WITH FILES;
DROP EXTERNAL TABLE IF EXISTS {{zone_name}}.xml_demos.products_xml  WITH FILES;
DROP SCHEMA IF EXISTS {{zone_name}}.xml_demos;
DROP ZONE IF EXISTS {{zone_name}};

Description

## When to Use Reach for `xml_paths` whenever a subtree is too deep, too variable, or too domain-specific to flatten cleanly, product specifications, legal clauses, supplier blocks, HL7 OBX segments, or anything you want to hand off to a downstream JSON parser or XPath evaluator. The top-level fields you care about for joins and filters stay flat and typed, while the messy inner structure is preserved verbatim. ## What You Will Learn 1. How `xml_paths` designates subtrees to capture as strings rather than flatten, the same flatten config handles top-level fields normally. 2. How `nested_output_format: "json"` serialises the captured subtree as a JSON object with underscore-joined keys, ready for `json_extract` or `LIKE '%...%'` probes. 3. How `nested_output_format: "xml"` emits a raw XML fragment string that preserves element names, attributes, and nesting, useful when you need to round-trip or feed an XPath library. 4. How multiple subtree captures coexist in one row (both `specifications` and `supplier` are captured per product). 5. How `include_paths` plus `xml_paths` plus `column_mappings` compose: captured paths are renamed just like any other column. 6. How a single `LOCATION` backs two tables, one JSON-serialised, one XML-serialised, so downstream consumers can pick their preferred format. 7. How to ASSERT on the captured string with cheap `LIKE '{%'` / `LIKE '<%'` checks that detect regressions in the serialiser without needing a JSON/XML parser in the assertion engine. ## Prerequisites An external zone, product catalog XML files with `<catalog><product id="..."><specifications>...</specifications><supplier>...</supplier></product></catalog>` shape (the demo ships two), and `ADMIN` on the target schema. You should already be comfortable with basic XML flattening before you pick which subtrees to capture whole.

Pitfalls

Open in interactive docs →   DeltaForge home →