SQL for Data Analysis: The Queries That Actually Matter
Window functions, CTEs, cohort retention, funnels, and the 10 queries that cover 80% of real-world business analysis. All PostgreSQL, all with realistic data.
Master these 10 queries and you can answer almost any business question
Window Functions: The Power Tool
Window functions operate on a set of rows related to the current row - without collapsing them into a single output row like GROUP BY does. They're the single most important SQL feature for data analysis.
ROW_NUMBER, RANK, DENSE_RANK
-- ROW_NUMBER: unique sequential numbers (great for dedup)
SELECT
    customer_id,
    order_date,
    amount,
    -- restarts at 1 for each customer, newest order first
    ROW_NUMBER() OVER (
        PARTITION BY customer_id
        ORDER BY order_date DESC
    ) AS rn
FROM orders;
-- rn: 1, 2, 3, 4 (no ties)
-- RANK: same number for ties, skips next
SELECT
    product_id,
    revenue,
    -- tied revenues share a rank; the following rank is skipped
    RANK() OVER (ORDER BY revenue DESC) AS revenue_rank
FROM product_sales;
-- rank: 1, 2, 2, 4 (skips 3)
-- DENSE_RANK: same number for ties, no skip
SELECT
    product_id,
    revenue,
    -- tied revenues share a rank; the next rank follows immediately
    DENSE_RANK() OVER (ORDER BY revenue DESC) AS dense_rnk
FROM product_sales;
-- dense_rnk: 1, 2, 2, 3 (no skip)
LAG / LEAD - Compare Adjacent Rows
-- Month-over-month revenue with % change.
-- Aggregate first, then apply the window once over the monthly rows --
-- the original repeated LAG(SUM(amount)) OVER (...) three times.
WITH monthly AS (
    SELECT
        date_trunc('month', order_date) AS month,
        SUM(amount) AS revenue
    FROM orders
    GROUP BY date_trunc('month', order_date)
)
SELECT
    month,
    revenue,
    LAG(revenue) OVER (ORDER BY month) AS prev_month,
    -- NULLIF guards a zero-revenue previous month (division by zero);
    -- the first month is NULL because LAG has no prior row.
    ROUND(
        (revenue - LAG(revenue) OVER (ORDER BY month))
        * 100.0 / NULLIF(LAG(revenue) OVER (ORDER BY month), 0),
        1
    ) AS pct_change
FROM monthly
ORDER BY month;
Running Totals & Moving Averages
-- Running total
SELECT
    order_date,
    amount,
    -- explicit ROWS frame: one step per row even on tied dates
    SUM(amount) OVER (
        ORDER BY order_date
        ROWS UNBOUNDED PRECEDING
    ) AS running_total
FROM orders;
-- 7-day moving average
-- NOTE(review): ROWS BETWEEN 6 PRECEDING counts *rows*, not calendar days.
-- If any day has no orders, the window silently spans more than 7 calendar
-- days. Join against a date spine first (see Date/Time Analysis) when the
-- data can have gaps.
SELECT day, daily_revenue,
AVG(daily_revenue) OVER (ORDER BY day ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS ma_7d
FROM (SELECT order_date AS day, SUM(amount) AS daily_revenue FROM orders GROUP BY 1) d;
Percent of Total
-- Each product's share of total revenue
SELECT
    product_name,
    SUM(amount) AS revenue,
    -- SUM(SUM(amount)) OVER () = the grand total across all groups:
    -- the inner SUM is the group aggregate, the outer is a window over
    -- every grouped row.
    ROUND(SUM(amount) * 100.0 / SUM(SUM(amount)) OVER (), 2) AS pct_of_total
FROM order_items
JOIN products USING (product_id)
GROUP BY product_name
ORDER BY revenue DESC;
CTEs: Readable, Composable Queries
Common Table Expressions let you break complex queries into named, readable steps. They're the difference between a 200-line nested subquery nightmare and something a human can actually understand.
Chaining CTEs
WITH active_users AS (
    -- Users who logged in at least 3 times in the last 30 days
    SELECT user_id, COUNT(*) AS login_count
    FROM logins
    WHERE login_date >= CURRENT_DATE - INTERVAL '30 days'
    GROUP BY user_id
    HAVING COUNT(*) >= 3
),
user_revenue AS (
    -- 30-day revenue per active user; COALESCE makes "no orders" read as 0
    SELECT
        au.user_id,
        au.login_count,
        COALESCE(SUM(o.amount), 0) AS revenue_30d
    FROM active_users au
    LEFT JOIN orders o
        ON o.user_id = au.user_id
        AND o.order_date >= CURRENT_DATE - INTERVAL '30 days'
    GROUP BY au.user_id, au.login_count
)
SELECT
    CASE
        WHEN revenue_30d >= 500 THEN 'whale'
        WHEN revenue_30d >= 100 THEN 'regular'
        WHEN revenue_30d > 0 THEN 'light'
        ELSE 'free'
    END AS segment,
    COUNT(*) AS users,
    ROUND(AVG(revenue_30d), 2) AS avg_revenue
FROM user_revenue
GROUP BY segment
ORDER BY avg_revenue DESC;
Recursive CTEs - Hierarchies
-- Org chart: find all reports under a manager
WITH RECURSIVE org_tree AS (
    -- Anchor: the manager we start from
    SELECT employee_id, name, manager_id, 0 AS depth
    FROM employees
    WHERE employee_id = 42

    UNION ALL

    -- Step: everyone reporting to someone already in the tree
    SELECT e.employee_id, e.name, e.manager_id, ot.depth + 1
    FROM employees e
    JOIN org_tree ot ON e.manager_id = ot.employee_id
    WHERE ot.depth < 10  -- safety limit: stops runaway recursion on cycles
)
SELECT * FROM org_tree ORDER BY depth, name;
Advanced Aggregations
GROUPING SETS & ROLLUP
-- Revenue at every combination: region x category detail, per-region and
-- per-category subtotals, plus a grand total.
SELECT
    COALESCE(region, '(All Regions)') AS region,
    COALESCE(product_category, '(All Products)') AS category,
    SUM(amount) AS revenue,
    COUNT(*) AS orders
FROM orders
JOIN products USING (product_id)
GROUP BY GROUPING SETS (
    (region, product_category),
    (region),
    (product_category),
    ()
)
-- GROUPING(col) = 1 on rows where col was rolled up, so detail rows sort
-- before their subtotals. (The original "ORDER BY region NULLS LAST" was
-- dead code: the output column "region" is already COALESCEd and never
-- NULL, so subtotal rows sorted alphabetically among real regions.)
ORDER BY GROUPING(region), region, GROUPING(product_category), category;
-- ROLLUP: hierarchical subtotals (quarters roll up into years, years into
-- a grand total)
SELECT
    EXTRACT(YEAR FROM order_date) AS yr,
    EXTRACT(QUARTER FROM order_date) AS qtr,
    SUM(amount) AS revenue
FROM orders
GROUP BY ROLLUP (
    EXTRACT(YEAR FROM order_date),
    EXTRACT(QUARTER FROM order_date)
)
-- subtotal rows carry NULL in the rolled-up column, hence NULLS LAST
ORDER BY yr NULLS LAST, qtr NULLS LAST;
FILTER - Conditional Aggregation
-- Monthly signup mix: total, per-plan counts, and pro share
SELECT
    date_trunc('month', created_at) AS month,
    COUNT(*) AS total_users,
    COUNT(*) FILTER (WHERE plan = 'pro') AS pro_users,
    COUNT(*) FILTER (WHERE plan = 'free') AS free_users,
    -- COUNT(*) is never 0 here: a month's group only exists if it has rows
    ROUND(COUNT(*) FILTER (WHERE plan = 'pro') * 100.0 / COUNT(*), 1) AS pro_pct
FROM users
GROUP BY date_trunc('month', created_at)
ORDER BY month;
Pivoting Data
-- Pivot: rows to columns using conditional aggregation.
-- SUM(...) FILTER over an empty set returns NULL, not 0; COALESCE makes
-- "no sales that quarter" read as 0 in the report.
SELECT
    product_category,
    COALESCE(SUM(amount) FILTER (WHERE EXTRACT(QUARTER FROM order_date) = 1), 0) AS q1,
    COALESCE(SUM(amount) FILTER (WHERE EXTRACT(QUARTER FROM order_date) = 2), 0) AS q2,
    COALESCE(SUM(amount) FILTER (WHERE EXTRACT(QUARTER FROM order_date) = 3), 0) AS q3,
    COALESCE(SUM(amount) FILTER (WHERE EXTRACT(QUARTER FROM order_date) = 4), 0) AS q4,
    SUM(amount) AS total
FROM orders
JOIN products USING (product_id)
WHERE order_date >= '2025-01-01'
GROUP BY product_category
ORDER BY total DESC;
-- Dynamic pivot with JSONB: one row per user, event counts keyed by type,
-- with no need to know the event types up front
SELECT
    user_id,
    jsonb_object_agg(event_type, event_count) AS events
FROM (
    SELECT user_id, event_type, COUNT(*) AS event_count
    FROM events
    GROUP BY user_id, event_type
) per_type
GROUP BY user_id;
Date/Time Analysis
Date Spine - Fill Missing Days
WITH date_spine AS (
    -- One row per calendar day, whether or not anything sold
    SELECT generate_series(
        '2025-01-01'::date, '2025-12-31'::date, '1 day'
    )::date AS day
),
daily_revenue AS (
    SELECT order_date AS day, SUM(amount) AS revenue
    FROM orders
    WHERE order_date >= '2025-01-01'
    GROUP BY order_date
)
-- LEFT JOIN keeps zero-revenue days; COALESCE turns the missing rows into 0
SELECT
    ds.day,
    COALESCE(dr.revenue, 0) AS revenue,
    SUM(COALESCE(dr.revenue, 0)) OVER (ORDER BY ds.day) AS cumulative
FROM date_spine ds
LEFT JOIN daily_revenue dr ON ds.day = dr.day;
Cohort Retention
-- Classic cohort retention: % of each signup cohort still active N months on.
-- Two fixes vs. the naive version:
--   1. months_since must convert the whole AGE interval to months;
--      EXTRACT(MONTH ...) alone drops the years (month 14 reads as 2).
--   2. The denominator is the cohort's size. A FILTER inside a group keyed
--      by months_since counts 0 for every months_since > 0 (division by
--      zero), so cohort size is computed in its own CTE.
WITH user_cohort AS (
    -- Each user's cohort = calendar month of their first signup
    SELECT user_id, date_trunc('month', MIN(created_at)) AS cohort_month
    FROM users
    GROUP BY user_id
),
user_activity AS (
    -- One row per user per month with any activity
    SELECT DISTINCT user_id, date_trunc('month', event_date) AS active_month
    FROM events
),
cohort_size AS (
    -- Denominator: how many users started in each cohort
    SELECT cohort_month, COUNT(*) AS cohort_users
    FROM user_cohort
    GROUP BY cohort_month
)
SELECT
    uc.cohort_month,
    (EXTRACT(YEAR FROM AGE(ua.active_month, uc.cohort_month)) * 12
     + EXTRACT(MONTH FROM AGE(ua.active_month, uc.cohort_month)))::int AS months_since,
    COUNT(DISTINCT ua.user_id) AS active_users,
    ROUND(COUNT(DISTINCT ua.user_id) * 100.0 / cs.cohort_users, 1) AS retention_pct
FROM user_cohort uc
JOIN user_activity ua USING (user_id)
JOIN cohort_size cs USING (cohort_month)
GROUP BY uc.cohort_month, months_since, cs.cohort_users
ORDER BY uc.cohort_month, months_since;
Year-over-Year Comparison
-- YoY revenue by month: partition by calendar month so LAG reaches the
-- same month one year earlier
WITH monthly AS (
    SELECT
        date_trunc('month', order_date) AS month,
        EXTRACT(YEAR FROM order_date) AS yr,
        SUM(amount) AS revenue
    FROM orders
    GROUP BY date_trunc('month', order_date), EXTRACT(YEAR FROM order_date)
),
with_prev AS (
    -- Compute the prior-year value once instead of repeating the window
    SELECT
        month,
        revenue,
        LAG(revenue) OVER (
            PARTITION BY EXTRACT(MONTH FROM month)
            ORDER BY yr
        ) AS prev_year
    FROM monthly
)
SELECT
    month,
    revenue,
    prev_year,
    -- NULLIF: a zero-revenue prior year yields NULL instead of an error
    ROUND((revenue - prev_year) * 100.0 / NULLIF(prev_year, 0), 1) AS yoy_pct
FROM with_prev
ORDER BY month;
Performance: Making Queries Fast
EXPLAIN ANALYZE - Read the Plan
-- ANALYZE actually executes the statement (be careful wrapping writes);
-- BUFFERS adds shared-buffer hit/read counts per plan node.
EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT)
SELECT * FROM orders
WHERE user_id = 12345
AND order_date >= '2025-01-01'
ORDER BY order_date DESC
LIMIT 10;
Key things to look for in the output:
- Seq Scan on large tables - needs an index
- Nested Loop with high row estimates - consider Hash Join
- Sort with high cost - add index matching ORDER BY
- actual rows vs. estimated rows - large mismatches mean stale statistics (run ANALYZE)
Index Strategies
-- Composite index: matches WHERE + ORDER BY
-- (equality column first, then the sort/range column, so the index can
-- serve both the filter and the DESC ordering without a separate Sort)
CREATE INDEX idx_orders_user_date
ON orders (user_id, order_date DESC);
-- Partial index: only index what you query
-- (smaller index and cheaper writes; the planner uses it only when the
-- query's WHERE clause implies status = 'active')
CREATE INDEX idx_orders_active
ON orders (user_id, order_date)
WHERE status = 'active';
-- Covering index: includes all columns needed (index-only scan)
-- (INCLUDE columns are stored in leaf pages but are not part of the sort
-- key, keeping the key small while still avoiding heap fetches)
CREATE INDEX idx_orders_covering
ON orders (user_id, order_date DESC)
INCLUDE (amount, status);
Common Anti-Patterns
| Anti-Pattern | Why It's Slow | Fix |
|---|---|---|
| SELECT * | Reads all columns, prevents index-only scans | Select only needed columns |
| WHERE LOWER(email) = ... | Function on column prevents index use | Create expression index or use citext |
| WHERE date::text LIKE '2025%' | Cast prevents index use | Use WHERE date >= '2025-01-01' |
| NOT IN (subquery) | Poor performance with NULLs | Use NOT EXISTS instead |
| CTE as optimization fence | PostgreSQL <12 materializes all CTEs | Use subqueries for performance-critical paths |
| ORDER BY RANDOM() | Full table scan + sort | Use TABLESAMPLE or pre-computed random column |
The 10 Essential Queries
These 10 queries cover ~80% of real-world business analysis. Master them and you can answer almost any question a stakeholder throws at you.
1. Cohort Retention
See the full query in the Date/Time Analysis section above.
2. Conversion Funnel
WITH funnel AS (
    -- One row: distinct users reaching each stage in the last 30 days
    SELECT
        COUNT(DISTINCT user_id) FILTER (WHERE event = 'signup') AS signups,
        COUNT(DISTINCT user_id) FILTER (WHERE event = 'activation') AS activated,
        COUNT(DISTINCT user_id) FILTER (WHERE event = 'first_purchase') AS purchased,
        COUNT(DISTINCT user_id) FILTER (WHERE event = 'repeat_purchase') AS repeated
    FROM user_events
    WHERE event_date >= CURRENT_DATE - INTERVAL '30 days'
)
SELECT
    signups,
    -- NULLIF: an empty upstream stage yields a NULL rate instead of a
    -- division-by-zero error
    activated, ROUND(activated * 100.0 / NULLIF(signups, 0), 1) AS activation_rate,
    purchased, ROUND(purchased * 100.0 / NULLIF(activated, 0), 1) AS purchase_rate,
    repeated, ROUND(repeated * 100.0 / NULLIF(purchased, 0), 1) AS repeat_rate
FROM funnel;
3-5. Running Total, Moving Average, YoY
Covered in Window Functions and Date/Time Analysis above.
6. Percentile Calculation
-- Per-user lifetime revenue distribution: median plus tail percentiles.
-- PERCENTILE_CONT interpolates between rows (continuous distribution).
SELECT
    PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY total_revenue) AS median,
    PERCENTILE_CONT(0.90) WITHIN GROUP (ORDER BY total_revenue) AS p90,
    PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY total_revenue) AS p95,
    PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY total_revenue) AS p99
FROM (
    SELECT user_id, SUM(amount) AS total_revenue
    FROM orders
    GROUP BY user_id
) per_user;
7. Sessionization
WITH events_with_gap AS (
    -- Minutes since this user's previous event (NULL for their first event)
    SELECT *,
        EXTRACT(EPOCH FROM event_time - LAG(event_time) OVER (
            PARTITION BY user_id ORDER BY event_time
        )) / 60 AS minutes_since_last
    FROM events
),
sessions AS (
    -- A session starts at the first event or after a >30-minute gap;
    -- the running count of starts numbers each user's sessions 1, 2, 3...
    SELECT *,
        SUM(CASE
                WHEN minutes_since_last IS NULL THEN 1
                WHEN minutes_since_last > 30 THEN 1
                ELSE 0
            END) OVER (
            PARTITION BY user_id ORDER BY event_time
        ) AS session_id
    FROM events_with_gap
)
SELECT
    user_id,
    session_id,
    MIN(event_time) AS session_start,
    MAX(event_time) AS session_end,
    COUNT(*) AS events_in_session
FROM sessions
GROUP BY user_id, session_id;
8. Deduplication
-- Keep only the most recently updated row per email
SELECT *
FROM (
    SELECT *,
        ROW_NUMBER() OVER (
            PARTITION BY email
            ORDER BY updated_at DESC
        ) AS rn
    FROM users_staging
) ranked
WHERE rn = 1;
9. Gap Analysis
-- Find holes in the order_id sequence
WITH gaps AS (
    SELECT
        order_id,
        LEAD(order_id) OVER (ORDER BY order_id) AS next_id,
        LEAD(order_id) OVER (ORDER BY order_id) - order_id AS gap_size
    FROM orders
)
SELECT
    order_id AS gap_after,  -- last existing id before the hole
    next_id AS gap_before,  -- first existing id after the hole
    gap_size
FROM gaps
WHERE gap_size > 1
ORDER BY gap_size DESC;
10. Cumulative Distribution
-- Where each user sits in the revenue distribution, plus what share of total
-- revenue the users from the top spender down to this one account for.
SELECT
    user_id,
    total_revenue,
    CUME_DIST() OVER (ORDER BY total_revenue) AS revenue_percentile,
    -- Explicit ROWS frame: the default RANGE frame includes all peers of a
    -- tied total_revenue in one step, so tied users would all show the same
    -- (jumped) cumulative share instead of a strict per-row running total.
    SUM(total_revenue) OVER (
        ORDER BY total_revenue DESC
        ROWS UNBOUNDED PRECEDING
    ) * 100.0 / SUM(total_revenue) OVER () AS cumulative_pct_of_total
FROM (
    SELECT user_id, SUM(amount) AS total_revenue
    FROM orders
    GROUP BY user_id
) per_user
ORDER BY total_revenue DESC;
The Bottom Line
SQL is the most underrated skill in data analysis. These 10 queries - built on window functions, CTEs, and conditional aggregation - cover the vast majority of business questions. Learn them by pattern, not by memorization. When a stakeholder asks "what's our retention?" you should think cohort + date_trunc + COUNT DISTINCT, not try to remember exact syntax. The syntax you can look up. The patterns are what make you fast.