Read 7 regional NYT RSS XML feeds (231 items, 4 namespaces) into a joined-categories article table and an exploded category table for keyword analytics.
-- === SETUP ===
CREATE ZONE IF NOT EXISTS {{zone_name}} TYPE EXTERNAL
COMMENT 'External tables, demo datasets and file-backed data';
CREATE SCHEMA IF NOT EXISTS {{zone_name}}.xml_demos
COMMENT 'XML-backed external tables';
-- Table 1: one row per article, categories joined into a comma string
CREATE EXTERNAL TABLE IF NOT EXISTS {{zone_name}}.xml_demos.news_articles
USING XML
LOCATION '{{data_path}}'
OPTIONS (
xml_flatten_config = '{
"row_xpath": "//item",
"include_paths": [
"/rss/channel/item/title",
"/rss/channel/item/link",
"/rss/channel/item/guid",
"/rss/channel/item/guid/@isPermaLink",
"/rss/channel/item/description",
"/rss/channel/item/creator",
"/rss/channel/item/pubDate",
"/rss/channel/item/category",
"/rss/channel/item/content/@url",
"/rss/channel/item/content/@height",
"/rss/channel/item/content/@width",
"/rss/channel/item/content/@medium",
"/rss/channel/item/credit"
],
"exclude_paths": [
"/rss/channel/image",
"/rss/channel/title",
"/rss/channel/link",
"/rss/channel/description",
"/rss/channel/language",
"/rss/channel/copyright",
"/rss/channel/lastBuildDate",
"/rss/channel/pubDate"
],
"default_repeat_handling": "join_comma",
"column_mappings": {
"/rss/channel/item/title": "rss_channel_item_title",
"/rss/channel/item/link": "rss_channel_item_link",
"/rss/channel/item/guid": "rss_channel_item_guid",
"/rss/channel/item/guid/@isPermaLink": "rss_channel_item_guid_attr_is_perma_link",
"/rss/channel/item/description": "rss_channel_item_description",
"/rss/channel/item/creator": "rss_channel_item_creator",
"/rss/channel/item/pubDate": "rss_channel_item_pub_date",
"/rss/channel/item/category": "rss_channel_item_category",
"/rss/channel/item/credit": "rss_channel_item_credit",
"/rss/channel/item/content/@url": "rss_channel_item_content_attr_url",
"/rss/channel/item/content/@height": "rss_channel_item_content_attr_height",
"/rss/channel/item/content/@width": "rss_channel_item_content_attr_width",
"/rss/channel/item/content/@medium": "rss_channel_item_content_attr_medium"
},
"include_attributes": true,
"separator": "_",
"max_depth": 10,
"strip_namespace_prefixes": true,
"namespaces": {
"dc": "http://purl.org/dc/elements/1.1/",
"media": "http://search.yahoo.com/mrss/",
"atom": "http://www.w3.org/2005/Atom",
"nyt": "http://www.nytimes.com/namespaces/rss/2.0"
}
}',
file_metadata = '{"columns":["df_file_name","df_file_modified","df_dataset","df_row_number"]}'
);
GRANT ADMIN ON TABLE {{zone_name}}.xml_demos.news_articles TO USER {{current_user}};
-- Table 2: one row per <category> per article (for keyword analytics)
CREATE EXTERNAL TABLE IF NOT EXISTS {{zone_name}}.xml_demos.news_categories
USING XML
LOCATION '{{data_path}}'
OPTIONS (
xml_flatten_config = '{
"row_xpath": "//item",
"explode_paths": ["/rss/channel/item/category"],
"include_paths": [
"/rss/channel/item/title",
"/rss/channel/item/link",
"/rss/channel/item/creator",
"/rss/channel/item/pubDate",
"/rss/channel/item/category",
"/rss/channel/item/category/@domain"
],
"exclude_paths": [
"/rss/channel/image",
"/rss/channel/title",
"/rss/channel/link",
"/rss/channel/description",
"/rss/channel/language",
"/rss/channel/copyright",
"/rss/channel/lastBuildDate",
"/rss/channel/pubDate"
],
"column_mappings": {
"/rss/channel/item/title": "rss_channel_item_title",
"/rss/channel/item/link": "rss_channel_item_link",
"/rss/channel/item/creator": "rss_channel_item_creator",
"/rss/channel/item/pubDate": "rss_channel_item_pub_date",
"/rss/channel/item/category": "rss_channel_item_category",
"/rss/channel/item/category/@domain": "rss_channel_item_category_attr_domain"
},
"include_attributes": true,
"separator": "_",
"max_depth": 10,
"strip_namespace_prefixes": true,
"namespaces": {
"dc": "http://purl.org/dc/elements/1.1/",
"media": "http://search.yahoo.com/mrss/",
"atom": "http://www.w3.org/2005/Atom",
"nyt": "http://www.nytimes.com/namespaces/rss/2.0"
}
}',
file_metadata = '{"columns":["df_file_name","df_file_modified","df_dataset","df_row_number"]}'
);
GRANT ADMIN ON TABLE {{zone_name}}.xml_demos.news_categories TO USER {{current_user}};
-- === QUERIES ===
-- 1. Total article count across 7 regional files
ASSERT ROW_COUNT = 231
SELECT * FROM {{zone_name}}.xml_demos.news_articles;
-- 2. Namespace stripping: every item has an author extracted from <dc:creator>
ASSERT VALUE author_count = 231
SELECT COUNT(*) FILTER (WHERE rss_channel_item_creator IS NOT NULL) AS author_count
FROM {{zone_name}}.xml_demos.news_articles;
-- 3. Self-closing attribute extraction: thumbnail URLs from <media:content/>
ASSERT VALUE thumbnail_count = 215
SELECT COUNT(*) FILTER (WHERE rss_channel_item_content_attr_url LIKE 'https://%') AS thumbnail_count
FROM {{zone_name}}.xml_demos.news_articles;
-- 4. Per-region counts using df_file_name metadata
ASSERT ROW_COUNT = 7
SELECT df_file_name AS region_file, COUNT(*) AS article_count
FROM {{zone_name}}.xml_demos.news_articles
GROUP BY df_file_name
ORDER BY article_count DESC;
-- 5. Explode: one row per <category> per article
ASSERT ROW_COUNT = 2023
SELECT * FROM {{zone_name}}.xml_demos.news_categories;
-- 6. People keyword analytics via @domain
ASSERT ROW_COUNT = 10
SELECT rss_channel_item_category AS person,
COUNT(*) AS mention_count
FROM {{zone_name}}.xml_demos.news_categories
WHERE rss_channel_item_category_attr_domain LIKE '%nyt_per'
GROUP BY rss_channel_item_category
ORDER BY mention_count DESC
LIMIT 10;
-- === CLEANUP ===
DROP EXTERNAL TABLE IF EXISTS {{zone_name}}.xml_demos.news_articles WITH FILES;
DROP EXTERNAL TABLE IF EXISTS {{zone_name}}.xml_demos.news_categories WITH FILES;
DROP SCHEMA IF EXISTS {{zone_name}}.xml_demos;
DROP ZONE IF EXISTS {{zone_name}};
## When to Use Use this pattern for real-world XML feeds that carry multiple namespaces (RSS plus Dublin Core plus Media RSS plus custom), repeating child elements that must sometimes be aggregated and sometimes exploded, and self-closing elements whose interesting data lives in attributes. RSS, Atom, sitemap, and many SOAP response shapes all fit this mould. ## What You Will Learn 1. How to register `namespaces` in the flatten config and combine it with `strip_namespace_prefixes: true` so `<dc:creator>` flattens to a clean path like `/rss/channel/item/creator`. 2. How `default_repeat_handling: "join_comma"` collapses 0-to-N `<category>` children per article into a single comma-joined string column. 3. How a second external table over the same `LOCATION` can use `explode_paths` on the category path to give one row per category, ideal for keyword frequency analytics. 4. How the `@domain` attribute on a repeated element survives both the join and the explode so you can filter `WHERE category_attr_domain LIKE '%nyt_per'` to isolate people, places, or organisations. 5. How `<media:content url="..." height="..."/>` self-closing elements project every attribute as its own column (`content_attr_url`, `content_attr_height`, `content_attr_width`, `content_attr_medium`). 6. How `exclude_paths` prunes channel-level metadata (`/rss/channel/title`, `/rss/channel/lastBuildDate`, etc.) that would otherwise pollute the per-item row. 7. How `df_file_name` file-metadata identifies which region a row came from, enabling per-region analytics without a separate column in the XML. 8. How optional elements (`<media:description>`, some namespaced extensions) become `NULL` rather than errors for items that lack them. ## Prerequisites An external zone, seven NYT-style RSS XML files (`Africa.xml`, `Americas.xml`, `AsiaPacific.xml`, `Europe.xml`, `MiddleEast.xml`, `World.xml`, `news.xml`), and `ADMIN` on the target schema. General familiarity with XML namespaces and RSS 2.0 helps when reading the field counts.