XML NYT News: RSS Feed Analysis

Read 7 regional NYT RSS XML feeds (231 items, 4 namespaces) into a joined-categories article table and an exploded category table for keyword analytics.

Category: xml

Syntax

-- === SETUP ===
CREATE ZONE IF NOT EXISTS {{zone_name}} TYPE EXTERNAL
    COMMENT 'External tables, demo datasets and file-backed data';
CREATE SCHEMA IF NOT EXISTS {{zone_name}}.xml_demos
    COMMENT 'XML-backed external tables';

-- Table 1: one row per article, categories joined into a comma string
CREATE EXTERNAL TABLE IF NOT EXISTS {{zone_name}}.xml_demos.news_articles
USING XML
LOCATION '{{data_path}}'
OPTIONS (
    xml_flatten_config = '{
        "row_xpath": "//item",
        "include_paths": [
            "/rss/channel/item/title",
            "/rss/channel/item/link",
            "/rss/channel/item/guid",
            "/rss/channel/item/guid/@isPermaLink",
            "/rss/channel/item/description",
            "/rss/channel/item/creator",
            "/rss/channel/item/pubDate",
            "/rss/channel/item/category",
            "/rss/channel/item/content/@url",
            "/rss/channel/item/content/@height",
            "/rss/channel/item/content/@width",
            "/rss/channel/item/content/@medium",
            "/rss/channel/item/credit"
        ],
        "exclude_paths": [
            "/rss/channel/image",
            "/rss/channel/title",
            "/rss/channel/link",
            "/rss/channel/description",
            "/rss/channel/language",
            "/rss/channel/copyright",
            "/rss/channel/lastBuildDate",
            "/rss/channel/pubDate"
        ],
        "default_repeat_handling": "join_comma",
        "column_mappings": {
            "/rss/channel/item/title":                  "rss_channel_item_title",
            "/rss/channel/item/link":                   "rss_channel_item_link",
            "/rss/channel/item/guid":                   "rss_channel_item_guid",
            "/rss/channel/item/guid/@isPermaLink":      "rss_channel_item_guid_attr_is_perma_link",
            "/rss/channel/item/description":            "rss_channel_item_description",
            "/rss/channel/item/creator":                "rss_channel_item_creator",
            "/rss/channel/item/pubDate":                "rss_channel_item_pub_date",
            "/rss/channel/item/category":               "rss_channel_item_category",
            "/rss/channel/item/credit":                 "rss_channel_item_credit",
            "/rss/channel/item/content/@url":           "rss_channel_item_content_attr_url",
            "/rss/channel/item/content/@height":        "rss_channel_item_content_attr_height",
            "/rss/channel/item/content/@width":         "rss_channel_item_content_attr_width",
            "/rss/channel/item/content/@medium":        "rss_channel_item_content_attr_medium"
        },
        "include_attributes": true,
        "separator": "_",
        "max_depth": 10,
        "strip_namespace_prefixes": true,
        "namespaces": {
            "dc":    "http://purl.org/dc/elements/1.1/",
            "media": "http://search.yahoo.com/mrss/",
            "atom":  "http://www.w3.org/2005/Atom",
            "nyt":   "http://www.nytimes.com/namespaces/rss/2.0"
        }
    }',
    file_metadata = '{"columns":["df_file_name","df_file_modified","df_dataset","df_row_number"]}'
);
GRANT ADMIN ON TABLE {{zone_name}}.xml_demos.news_articles TO USER {{current_user}};

-- Table 2: one row per <category> per article (for keyword analytics)
CREATE EXTERNAL TABLE IF NOT EXISTS {{zone_name}}.xml_demos.news_categories
USING XML
LOCATION '{{data_path}}'
OPTIONS (
    xml_flatten_config = '{
        "row_xpath": "//item",
        "explode_paths": ["/rss/channel/item/category"],
        "include_paths": [
            "/rss/channel/item/title",
            "/rss/channel/item/link",
            "/rss/channel/item/creator",
            "/rss/channel/item/pubDate",
            "/rss/channel/item/category",
            "/rss/channel/item/category/@domain"
        ],
        "exclude_paths": [
            "/rss/channel/image",
            "/rss/channel/title",
            "/rss/channel/link",
            "/rss/channel/description",
            "/rss/channel/language",
            "/rss/channel/copyright",
            "/rss/channel/lastBuildDate",
            "/rss/channel/pubDate"
        ],
        "column_mappings": {
            "/rss/channel/item/title":             "rss_channel_item_title",
            "/rss/channel/item/link":              "rss_channel_item_link",
            "/rss/channel/item/creator":           "rss_channel_item_creator",
            "/rss/channel/item/pubDate":           "rss_channel_item_pub_date",
            "/rss/channel/item/category":          "rss_channel_item_category",
            "/rss/channel/item/category/@domain":  "rss_channel_item_category_attr_domain"
        },
        "include_attributes": true,
        "separator": "_",
        "max_depth": 10,
        "strip_namespace_prefixes": true,
        "namespaces": {
            "dc":    "http://purl.org/dc/elements/1.1/",
            "media": "http://search.yahoo.com/mrss/",
            "atom":  "http://www.w3.org/2005/Atom",
            "nyt":   "http://www.nytimes.com/namespaces/rss/2.0"
        }
    }',
    file_metadata = '{"columns":["df_file_name","df_file_modified","df_dataset","df_row_number"]}'
);
GRANT ADMIN ON TABLE {{zone_name}}.xml_demos.news_categories TO USER {{current_user}};

-- === QUERIES ===
-- 1. Total article count across 7 regional files
ASSERT ROW_COUNT = 231
SELECT * FROM {{zone_name}}.xml_demos.news_articles;

-- 2. Namespace stripping: every item has an author extracted from <dc:creator>
ASSERT VALUE author_count = 231
SELECT COUNT(*) FILTER (WHERE rss_channel_item_creator IS NOT NULL) AS author_count
FROM {{zone_name}}.xml_demos.news_articles;

-- 3. Self-closing attribute extraction: thumbnail URLs from <media:content/>
ASSERT VALUE thumbnail_count = 215
SELECT COUNT(*) FILTER (WHERE rss_channel_item_content_attr_url LIKE 'https://%') AS thumbnail_count
FROM {{zone_name}}.xml_demos.news_articles;

-- 4. Per-region counts using df_file_name metadata
ASSERT ROW_COUNT = 7
SELECT df_file_name AS region_file, COUNT(*) AS article_count
FROM {{zone_name}}.xml_demos.news_articles
GROUP BY df_file_name
ORDER BY article_count DESC;

-- 5. Explode: one row per <category> per article
ASSERT ROW_COUNT = 2023
SELECT * FROM {{zone_name}}.xml_demos.news_categories;

-- 6. People keyword analytics via @domain
ASSERT ROW_COUNT = 10
SELECT rss_channel_item_category AS person,
       COUNT(*)                   AS mention_count
FROM {{zone_name}}.xml_demos.news_categories
WHERE rss_channel_item_category_attr_domain LIKE '%nyt_per'
GROUP BY rss_channel_item_category
ORDER BY mention_count DESC
LIMIT 10;

-- === CLEANUP ===
DROP EXTERNAL TABLE IF EXISTS {{zone_name}}.xml_demos.news_articles  WITH FILES;
DROP EXTERNAL TABLE IF EXISTS {{zone_name}}.xml_demos.news_categories WITH FILES;
DROP SCHEMA IF EXISTS {{zone_name}}.xml_demos;
DROP ZONE IF EXISTS {{zone_name}};

Description

## When to Use Use this pattern for real-world XML feeds that carry multiple namespaces (RSS plus Dublin Core plus Media RSS plus custom), repeating child elements that must sometimes be aggregated and sometimes exploded, and self-closing elements whose interesting data lives in attributes. RSS, Atom, sitemap, and many SOAP response shapes all fit this mould. ## What You Will Learn 1. How to register `namespaces` in the flatten config and combine it with `strip_namespace_prefixes: true` so `<dc:creator>` flattens to a clean path like `/rss/channel/item/creator`. 2. How `default_repeat_handling: "join_comma"` collapses 0-to-N `<category>` children per article into a single comma-joined string column. 3. How a second external table over the same `LOCATION` can use `explode_paths` on the category path to give one row per category, ideal for keyword frequency analytics. 4. How the `@domain` attribute on a repeated element survives both the join and the explode so you can filter `WHERE category_attr_domain LIKE '%nyt_per'` to isolate people, places, or organisations. 5. How `<media:content url="..." height="..."/>` self-closing elements project every attribute as its own column (`content_attr_url`, `content_attr_height`, `content_attr_width`, `content_attr_medium`). 6. How `exclude_paths` prunes channel-level metadata (`/rss/channel/title`, `/rss/channel/lastBuildDate`, etc.) that would otherwise pollute the per-item row. 7. How `df_file_name` file-metadata identifies which region a row came from, enabling per-region analytics without a separate column in the XML. 8. How optional elements (`<media:description>`, some namespaced extensions) become `NULL` rather than errors for items that lack them. ## Prerequisites An external zone, seven NYT-style RSS XML files (`Africa.xml`, `Americas.xml`, `AsiaPacific.xml`, `Europe.xml`, `MiddleEast.xml`, `World.xml`, `news.xml`), and `ADMIN` on the target schema. General familiarity with XML namespaces and RSS 2.0 helps when reading the field counts.

Pitfalls

`strip_namespace_prefixes: true` is not enough on its own, you also need the `namespaces` map so the parser can resolve the prefix bindings when matching paths. Leaving `namespaces` empty causes `<dc:creator>` to be ignored because the prefix is unbound.
Every regional file binds the same prefixes (`xmlns:dc=...`, `xmlns:media=...`), but if a future feed rebinds `dc:` to a different URI the column will silently go `NULL`, match namespaces by URI, not by prefix text.
Default repeat handling is a table-level knob. `default_repeat_handling: "join_comma"` applies to every repeated path that is not in `explode_paths`; if you also want `<enclosure>` exploded, list it explicitly or it will be comma-joined too.
When a category is joined to comma, the `@domain` attribute is lost on the article table. Keep the exploded categories table if you need to filter by domain.
`<media:content>` is self-closing, so its text content is the empty string. Filter on its attributes (`content_attr_url`) rather than on the element itself, otherwise `WHERE content IS NOT NULL` matches nothing.
HTML entities in `<description>` (`>`, `'`, `&`) are decoded by the parser. Downstream consumers should not attempt to decode a second time or they will mangle any legitimate `&` in the text.
`exclude_paths` at channel level is essential, without it every `<item>` row inherits the channel's `<title>`, `<link>`, etc. and both columns become ambiguous with the item-level fields of the same name.

Open in interactive docs → DeltaForge home →