JSON Subtree Capture — json_paths

Capture complex nested subtrees as serialized JSON string columns with json_paths, and contrast against full flattening.

Category: json

Syntax

-- ============================================================================
-- Setup: two external tables over the same 5 property listings
-- ============================================================================
CREATE ZONE IF NOT EXISTS {{zone_name}} TYPE EXTERNAL
    COMMENT 'External tables — demo datasets and file-backed data';

CREATE SCHEMA IF NOT EXISTS {{zone_name}}.json_demos
    COMMENT 'JSON-backed external tables';

-- Captured: location and pricing kept as JSON strings
CREATE EXTERNAL TABLE IF NOT EXISTS {{zone_name}}.json_demos.listings_captured
USING JSON
LOCATION '{{data_path}}'
OPTIONS (
    json_flatten_config = '{
        "root_path": "$",
        "include_paths": [
            "$.id", "$.title", "$.type",
            "$.bedrooms", "$.bathrooms", "$.sqft",
            "$.year_built", "$.status",
            "$.location", "$.pricing", "$.tags"
        ],
        "json_paths": ["$.location", "$.pricing"],
        "column_mappings": {
            "$.id": "id",
            "$.type": "type",
            "$.year_built": "year_built",
            "$.location": "location",
            "$.pricing":  "pricing"
        },
        "max_depth": 10,
        "separator": "_",
        "default_array_handling": "to_json",
        "infer_types": true
    }',
    file_metadata = '{"columns":["df_file_name","df_row_number"]}'
);
GRANT ADMIN ON TABLE {{zone_name}}.json_demos.listings_captured TO USER {{current_user}};

-- Flattened: same subtrees expanded into individual columns
CREATE EXTERNAL TABLE IF NOT EXISTS {{zone_name}}.json_demos.listings_flattened
USING JSON
LOCATION '{{data_path}}'
OPTIONS (
    json_flatten_config = '{
        "root_path": "$",
        "include_paths": [
            "$.id", "$.title", "$.type",
            "$.bedrooms", "$.bathrooms", "$.sqft",
            "$.year_built", "$.status",
            "$.location.address.street", "$.location.address.unit",
            "$.location.address.city",   "$.location.address.state",
            "$.location.address.zip",
            "$.location.geo.lat", "$.location.geo.lng",
            "$.location.neighborhood", "$.location.walk_score",
            "$.pricing.list_price", "$.pricing.price_per_sqft",
            "$.pricing.hoa_monthly", "$.pricing.tax_annual",
            "$.pricing.mortgage_estimate.monthly_payment",
            "$.tags"
        ],
        "column_mappings": {
            "$.id": "id",
            "$.type": "type",
            "$.location.address.street":     "location_address_street",
            "$.location.address.city":       "location_address_city",
            "$.location.address.state":      "location_address_state",
            "$.location.address.zip":        "location_address_zip",
            "$.location.geo.lat":            "location_geo_lat",
            "$.location.geo.lng":            "location_geo_lng",
            "$.location.neighborhood":       "location_neighborhood",
            "$.location.walk_score":         "location_walk_score",
            "$.pricing.list_price":          "pricing_list_price",
            "$.pricing.price_per_sqft":      "pricing_price_per_sqft",
            "$.pricing.hoa_monthly":         "pricing_hoa_monthly",
            "$.pricing.tax_annual":          "pricing_tax_annual",
            "$.pricing.mortgage_estimate.monthly_payment": "pricing_mortgage_estimate_monthly_payment"
        },
        "max_depth": 10,
        "separator": "_",
        "default_array_handling": "to_json",
        "infer_types": true
    }',
    file_metadata = '{"columns":["df_file_name","df_row_number"]}'
);
GRANT ADMIN ON TABLE {{zone_name}}.json_demos.listings_flattened TO USER {{current_user}};

-- ============================================================================
-- Queries: captured vs flattened
-- ============================================================================
-- Both tables see the same 5 listings
ASSERT ROW_COUNT = 5
SELECT * FROM {{zone_name}}.json_demos.listings_captured;

ASSERT ROW_COUNT = 5
SELECT * FROM {{zone_name}}.json_demos.listings_flattened;

-- Every captured row is a JSON object (starts with '{')
ASSERT VALUE json_location_count = 5
SELECT COUNT(*) FILTER (WHERE location LIKE '{%') AS json_location_count
FROM {{zone_name}}.json_demos.listings_captured;

ASSERT VALUE json_pricing_count = 5
SELECT COUNT(*) FILTER (WHERE pricing LIKE '{%') AS json_pricing_count
FROM {{zone_name}}.json_demos.listings_captured;

-- Flattened table lets you filter on nested fields directly
ASSERT ROW_COUNT = 5
ASSERT VALUE location_address_city  = 'San Francisco' WHERE id = 'LST-001'
ASSERT VALUE location_neighborhood  = 'Hawthorne'     WHERE id = 'LST-002'
ASSERT VALUE pricing_list_price     = 4750000         WHERE id = 'LST-005'
SELECT id, title, type, bedrooms,
       location_address_city, location_address_state, location_neighborhood,
       pricing_list_price, pricing_tax_annual,
       pricing_mortgage_estimate_monthly_payment
FROM {{zone_name}}.json_demos.listings_flattened
ORDER BY id;

-- Side-by-side: JOIN captured and flattened on id
ASSERT ROW_COUNT = 1
ASSERT VALUE location_address_city = 'Boston'
ASSERT VALUE pricing_mortgage_estimate_monthly_payment = 27730
SELECT c.id, c.title, c.location,
       f.location_address_city, f.location_address_state,
       f.pricing_list_price, f.pricing_mortgage_estimate_monthly_payment
FROM {{zone_name}}.json_demos.listings_captured  c
JOIN {{zone_name}}.json_demos.listings_flattened f ON c.id = f.id
WHERE c.id = 'LST-005';

-- ============================================================================
-- Cleanup
-- ============================================================================
DROP EXTERNAL TABLE IF EXISTS {{zone_name}}.json_demos.listings_captured  WITH FILES;
DROP EXTERNAL TABLE IF EXISTS {{zone_name}}.json_demos.listings_flattened WITH FILES;
DROP SCHEMA IF EXISTS {{zone_name}}.json_demos;
DROP ZONE   IF EXISTS {{zone_name}};

Description

## When to Use Use this demo when a JSON document has subtrees you want to preserve verbatim — typically for downstream systems that re-serialize the JSON (map renderers, mortgage calculators, API responses) — but you still want the top-level scalar fields as proper columns. `json_paths` tells DeltaForge "stop flattening here, hand me the whole subtree as a JSON string." The demo builds two tables from the same 5 property listings: `listings_captured` (location + pricing kept as JSON blobs) and `listings_flattened` (same subtrees expanded into individual columns) so you can compare the two shapes side by side. ## What You Will Learn 1. How `json_paths: ["$.location", "$.pricing"]` turns two subtrees into two VARCHAR columns containing raw JSON. 2. How `json_paths` must be combined with `include_paths` — the path must appear in BOTH for the column to materialize. 3. How captured subtrees coexist with normally flattened top-level fields (`id`, `title`, `bedrooms`). 4. How `column_mappings` renames both flattened fields AND captured JSON columns for readability. 5. How the same source files produce a fully flattened shape (`location_address_city`, `pricing_mortgage_estimate_monthly_payment`) when `json_paths` is omitted. 6. How to JOIN the captured and flattened tables on a shared key to demonstrate equivalence. 7. How `LIKE '{%'` and `LIKE '%list_price%'` are pragmatic integrity checks on captured JSON. 8. When to prefer captured JSON (API pass-through, schema churn, unknown consumers) over full flattening (analytics, BI, direct SQL predicates). ## Prerequisites A DeltaForge catalog with create-zone / create-schema permission and the two listing files (`01_listings_residential.json`, `02_listings_commercial.json`, 5 listings total) in the `{{data_path}}` directory. Reviewing the music-catalog demo first gives useful context for `json_paths` vs. `explode_paths`.

Pitfalls

`json_paths` only works if the path is ALSO in `include_paths`. If you list `$.location` in `json_paths` but forget it in `include_paths`, the `location` column never materializes and the ASSERT `LIKE '{%'` silently tests a NULL column (count = 0) — a confusing failure mode.
Captured JSON columns are plain VARCHAR; SQL cannot filter on nested fields without parsing. `WHERE location LIKE '%San Francisco%'` is brittle (substring match collides with "South San Francisco"); prefer `json_extract`/`->>` for real predicates, and reserve captured JSON for pass-through scenarios.
When `json_paths` captures a subtree, `max_depth` no longer controls its internal structure — the whole subtree is serialized regardless of depth. Shrinking `max_depth` to 3 will NOT truncate the captured blob, it will only affect non-captured branches.
The captured string preserves key ordering from the source file, not alphabetical order. Equality comparisons between two captured columns (`c1.location = c2.location`) can fail even when the logical object is identical; normalize via `json_extract` or a canonicalizer before comparing.
`default_array_handling: "to_json"` means the `tags` array is emitted as a JSON string in BOTH tables. Casting `tags` to a native array for `UNNEST` requires a parser step — direct `WHERE 'luxury' = ANY(tags)` will not work.
The flattened table's column names hard-code the source path structure (`pricing_mortgage_estimate_monthly_payment`). If the producer renames `mortgage_estimate` to `mortgage`, the column disappears and downstream views break — captured JSON is more resilient to producer-side renames at the cost of SQL ergonomics.

JSON Subtree Capture — json_paths

Syntax

Description

Pitfalls

See Also