Fix the small-files problem caused by daily micro-batch ingestion by compacting with OPTIMIZE and then purging orphaned versions with VACUUM, while proving data integrity stays intact.
-- SETUP
CREATE ZONE IF NOT EXISTS demo_zone TYPE EXTERNAL
COMMENT 'External and Delta tables -- demo datasets';
CREATE SCHEMA IF NOT EXISTS demo_zone.delta_demos
COMMENT 'Delta table management tutorial demos';
CREATE DELTA TABLE IF NOT EXISTS demo_zone.delta_demos.order_pipeline (
id INT,
order_ref VARCHAR,
category VARCHAR,
product VARCHAR,
price DOUBLE,
status VARCHAR,
order_date VARCHAR
) LOCATION '~/delta-data/order_pipeline';
-- Monday batch
INSERT INTO demo_zone.delta_demos.order_pipeline VALUES
(1, 'ORD-1001', 'Electronics', 'Wireless Headphones', 79.99, 'pending', '2025-03-03'),
(2, 'ORD-1002', 'Electronics', 'USB-C Hub', 34.99, 'pending', '2025-03-03'),
(3, 'ORD-1003', 'Clothing', 'Running Shoes', 129.99, 'pending', '2025-03-03'),
(4, 'ORD-1004', 'Home', 'Desk Lamp', 45.00, 'pending', '2025-03-03'),
(5, 'ORD-1005', 'Electronics', 'Keyboard', 89.99, 'pending', '2025-03-03'),
(6, 'ORD-1006', 'Clothing', 'Winter Jacket', 199.99, 'pending', '2025-03-03'),
(7, 'ORD-1007', 'Home', 'Coffee Maker', 65.00, 'pending', '2025-03-03'),
(8, 'ORD-1008', 'Books', 'SQL Cookbook', 49.99, 'pending', '2025-03-03');
-- Tuesday batch
INSERT INTO demo_zone.delta_demos.order_pipeline VALUES
(9, 'ORD-2001', 'Electronics', 'Monitor Stand', 39.99, 'pending', '2025-03-04'),
(10, 'ORD-2002', 'Clothing', 'Hiking Boots', 159.99, 'pending', '2025-03-04'),
(11, 'ORD-2003', 'Home', 'Air Purifier', 189.00, 'pending', '2025-03-04'),
(12, 'ORD-2004', 'Books', 'Data Engineering', 59.99, 'pending', '2025-03-04'),
(13, 'ORD-2005', 'Electronics', 'Webcam', 69.99, 'pending', '2025-03-04'),
(14, 'ORD-2006', 'Clothing', 'Polo Shirt', 35.00, 'pending', '2025-03-04'),
(15, 'ORD-2007', 'Home', 'Plant Pot Set', 28.00, 'pending', '2025-03-04'),
(16, 'ORD-2008', 'Books', 'Rust Programming', 44.99, 'pending', '2025-03-04');
-- Wednesday batch
INSERT INTO demo_zone.delta_demos.order_pipeline VALUES
(17, 'ORD-3001', 'Electronics', 'Tablet Stand', 24.99, 'pending', '2025-03-05'),
(18, 'ORD-3002', 'Clothing', 'Denim Jeans', 79.99, 'pending', '2025-03-05'),
(19, 'ORD-3003', 'Home', 'Bookshelf', 125.00, 'pending', '2025-03-05'),
(20, 'ORD-3004', 'Books', 'Clean Code', 39.99, 'pending', '2025-03-05'),
(21, 'ORD-3005', 'Electronics', 'Power Bank', 29.99, 'pending', '2025-03-05'),
(22, 'ORD-3006', 'Clothing', 'Sneakers', 95.00, 'pending', '2025-03-05'),
(23, 'ORD-3007', 'Home', 'Throw Blanket', 42.00, 'pending', '2025-03-05'),
(24, 'ORD-3008', 'Books', 'System Design', 54.99, 'pending', '2025-03-05');
-- Thursday batch
INSERT INTO demo_zone.delta_demos.order_pipeline VALUES
(25, 'ORD-4001', 'Electronics', 'Mouse Pad', 19.99, 'pending', '2025-03-06'),
(26, 'ORD-4002', 'Clothing', 'Windbreaker', 85.00, 'pending', '2025-03-06'),
(27, 'ORD-4003', 'Home', 'Kitchen Scale', 32.00, 'pending', '2025-03-06'),
(28, 'ORD-4004', 'Books', 'DDIA', 49.99, 'pending', '2025-03-06'),
(29, 'ORD-4005', 'Electronics', 'HDMI Cable', 12.99, 'pending', '2025-03-06'),
(30, 'ORD-4006', 'Clothing', 'Rain Coat', 110.00, 'pending', '2025-03-06'),
(31, 'ORD-4007', 'Home', 'Wall Clock', 38.00, 'pending', '2025-03-06'),
(32, 'ORD-4008', 'Books', 'The Pragmatic Prog.', 44.99, 'pending', '2025-03-06');
-- Friday batch
INSERT INTO demo_zone.delta_demos.order_pipeline VALUES
(33, 'ORD-5001', 'Electronics', 'Phone Case', 15.99, 'pending', '2025-03-07'),
(34, 'ORD-5002', 'Clothing', 'Baseball Cap', 22.00, 'pending', '2025-03-07'),
(35, 'ORD-5003', 'Home', 'Candle Set', 27.00, 'pending', '2025-03-07'),
(36, 'ORD-5004', 'Books', 'Learning SQL', 34.99, 'pending', '2025-03-07'),
(37, 'ORD-5005', 'Electronics', 'Screen Protector', 9.99, 'pending', '2025-03-07'),
(38, 'ORD-5006', 'Clothing', 'Swim Trunks', 40.00, 'pending', '2025-03-07'),
(39, 'ORD-5007', 'Home', 'Picture Frame', 18.00, 'pending', '2025-03-07'),
(40, 'ORD-5008', 'Books', 'Refactoring', 42.99, 'pending', '2025-03-07');
-- Ship Monday's batch
UPDATE demo_zone.delta_demos.order_pipeline
SET status = 'shipped'
WHERE id BETWEEN 1 AND 8;
-- Cancel 3 orders
DELETE FROM demo_zone.delta_demos.order_pipeline
WHERE id IN (11, 22, 37);
-- Fix a $5.00 missing surcharge on 4 orders
UPDATE demo_zone.delta_demos.order_pipeline
SET price = ROUND(price + 5.00, 2)
WHERE id IN (9, 17, 25, 33);
-- QUERIES
-- Pre-maintenance: fragmented file state
DESCRIBE DETAIL demo_zone.delta_demos.order_pipeline;
-- Integrity baseline: capture these numbers -- they must match after maintenance
SELECT category,
COUNT(*) AS order_count,
ROUND(SUM(price), 2) AS revenue
FROM demo_zone.delta_demos.order_pipeline
GROUP BY category
ORDER BY category;
SELECT status,
COUNT(*) AS status_count,
ROUND(SUM(price), 2) AS status_revenue
FROM demo_zone.delta_demos.order_pipeline
GROUP BY status
ORDER BY status;
-- STEP 1: OPTIMIZE -- compact small files into fewer large files
OPTIMIZE demo_zone.delta_demos.order_pipeline;
-- Post-OPTIMIZE: live files are compacted, but pre-OPTIMIZE files are now orphans on storage
DESCRIBE DETAIL demo_zone.delta_demos.order_pipeline;
-- STEP 2: VACUUM RETAIN 0 HOURS -- purge every orphan immediately
-- In production, keep the default 168 HOURS (7 days) so in-flight readers are not broken
VACUUM demo_zone.delta_demos.order_pipeline RETAIN 0 HOURS;
-- Post-VACUUM: storage now contains only the compacted files
DESCRIBE DETAIL demo_zone.delta_demos.order_pipeline;
-- INTEGRITY PROOF: aggregates must be identical to the pre-maintenance baseline
SELECT COUNT(*) AS total_orders,
ROUND(SUM(price), 2) AS total_revenue,
COUNT(DISTINCT category) AS categories
FROM demo_zone.delta_demos.order_pipeline;
-- Shipped rows are intact -- OPTIMIZE + VACUUM did not touch logical data
SELECT id, order_ref, product, price, status
FROM demo_zone.delta_demos.order_pipeline
WHERE status = 'shipped'
ORDER BY id;
-- Cancelled rows stay deleted -- maintenance never resurrects dropped rows
SELECT COUNT(*) AS cancelled_still_deleted
FROM demo_zone.delta_demos.order_pipeline
WHERE id IN (11, 22, 37);
-- Price fixes survived both steps
SELECT id, order_ref, product, price
FROM demo_zone.delta_demos.order_pipeline
WHERE id IN (9, 17, 25, 33)
ORDER BY id;
-- CLEANUP
DROP DELTA TABLE IF EXISTS demo_zone.delta_demos.order_pipeline WITH FILES;
DROP SCHEMA IF EXISTS demo_zone.delta_demos;
DROP ZONE IF EXISTS demo_zone;
## When to Use Anywhere a Delta table is fed by frequent small writes -- streaming, micro-batch jobs, or chatty CDC pipelines. Each small INSERT lands as its own Parquet file, and every UPDATE / DELETE rewrites files it touches, leaving the old versions on storage. Without maintenance, two problems compound: read performance degrades from opening thousands of tiny files, and storage fills with orphaned pre-mutation files. The standard playbook is a two-step sequence: OPTIMIZE first to compact the live files into fewer large ones, then VACUUM to delete the now-orphaned pre-compaction files. This demo stages the full problem (5 daily batches + UPDATE + DELETE + UPDATE) and runs the playbook while asserting that row counts, sums and specific row values stay identical before and after. ## What You Will Learn 1. How small-files accumulation happens naturally in streaming / micro-batch pipelines 2. How OPTIMIZE reads fragmented files and rewrites them into fewer, larger files -- a physical-only transform that does not change logical data 3. How DESCRIBE DETAIL reveals file count and size at each stage 4. How VACUUM RETAIN 0 HOURS purges ALL orphaned files immediately (default retention is 168 hours = 7 days) 5. Why the order matters: OPTIMIZE before VACUUM -- OPTIMIZE creates new orphans that VACUUM then cleans up in one pass 6. That OPTIMIZE adds a new commit to the version log while VACUUM does not -- VACUUM is a storage operation, not a data operation 7. How to prove data integrity with aggregates (total_orders, total_revenue, category counts) that must match byte-for-byte before and after maintenance ## Prerequisites None. The setup stages 40 inserted rows across 5 daily batches, then applies an UPDATE (ship Monday), a DELETE (3 cancellations), and another UPDATE (price fix on 4 rows) -- ending with 37 rows across many small files with multiple orphaned versions.
| Name | Type | Description |
|---|---|---|
TABLE_PATH | Filesystem or object-store location for the Delta table. | |
VACUUM_RETAIN_HOURS | Hours of history to retain. Use 0 for immediate cleanup (demos/tests). Production default is 168 (7 days) to avoid breaking in-flight readers. |