Skip to content
SQL query results showing data analysis patterns

Master these 10 queries and you can answer almost any business question

All examples use PostgreSQL 16+ syntax. Most queries work on MySQL 8+ and SQLite 3.25+ with minor adjustments. Where PostgreSQL-specific features are used, alternatives are noted.

Window Functions: The Power Tool

Window functions operate on a set of rows related to the current row - without collapsing them into a single output row like GROUP BY does. They're the single most important SQL feature for data analysis.

ROW_NUMBER, RANK, DENSE_RANK

-- ROW_NUMBER: assigns a unique sequential number within each partition,
-- even when the ORDER BY key ties (great for dedup).
SELECT
    customer_id,
    order_date,
    amount,
    ROW_NUMBER() OVER (
        PARTITION BY customer_id
        ORDER BY order_date DESC
    ) AS rn
FROM orders;
-- rn: 1, 2, 3, 4 (no ties)

-- RANK: tied rows share a number and the following rank is skipped.
SELECT
    product_id,
    revenue,
    RANK() OVER (
        ORDER BY revenue DESC
    ) AS revenue_rank
FROM product_sales;
-- rank: 1, 2, 2, 4 (skips 3)

-- DENSE_RANK: tied rows share a number and no rank is skipped.
SELECT
    product_id,
    revenue,
    DENSE_RANK() OVER (
        ORDER BY revenue DESC
    ) AS dense_rnk
FROM product_sales;
-- dense_rnk: 1, 2, 2, 3 (no skip)

LAG / LEAD - Compare Adjacent Rows

-- Month-over-month revenue with percent change.
-- Fixes over the naive version: the LAG window is computed once in a CTE
-- instead of three times inline, and the divisor is wrapped in NULLIF so a
-- zero-revenue month yields NULL instead of a division-by-zero error.
WITH monthly AS (
    SELECT
        date_trunc('month', order_date) AS month,
        SUM(amount) AS revenue
    FROM orders
    GROUP BY 1
)
SELECT
    month,
    revenue,
    LAG(revenue) OVER (ORDER BY month) AS prev_month,
    ROUND(
        (revenue - LAG(revenue) OVER (ORDER BY month))
        * 100.0 / NULLIF(LAG(revenue) OVER (ORDER BY month), 0), 1
    ) AS pct_change
FROM monthly
ORDER BY month;

Running Totals & Moving Averages

-- Running total: the explicit ROWS frame advances the cumulative sum one
-- row at a time (rather than relying on the default frame).
SELECT
    order_date,
    amount,
    SUM(amount) OVER (
        ORDER BY order_date
        ROWS UNBOUNDED PRECEDING
    ) AS running_total
FROM orders;

-- 7-day moving average
-- NOTE(review): the frame counts 6 preceding ROWS, not 6 calendar days.
-- If some days have no orders, the window silently spans more than a week;
-- join against a date spine first (see Date/Time Analysis) when gaps matter.
SELECT day, daily_revenue,
    AVG(daily_revenue) OVER (ORDER BY day ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS ma_7d
FROM (SELECT order_date AS day, SUM(amount) AS daily_revenue FROM orders GROUP BY 1) d;

Percent of Total

-- Each product's share of total revenue
-- SUM(SUM(amount)) OVER () nests a window over the grouped aggregate:
-- the inner SUM is per-product revenue, the outer windowed SUM (empty
-- OVER clause = whole result set) is the grand total.
SELECT
    product_name,
    SUM(amount) AS revenue,
    ROUND(
        SUM(amount) * 100.0 / SUM(SUM(amount)) OVER (), 2
    ) AS pct_of_total
FROM order_items
JOIN products USING (product_id)
GROUP BY product_name
ORDER BY revenue DESC;

CTEs: Readable, Composable Queries

Common Table Expressions let you break complex queries into named, readable steps. They're the difference between a 200-line nested subquery nightmare and something a human can actually understand.

Chaining CTEs

WITH active_users AS (
    -- Users with at least 3 logins in the last 30 days.
    SELECT user_id, COUNT(*) AS login_count
    FROM logins WHERE login_date >= CURRENT_DATE - INTERVAL '30 days'
    GROUP BY user_id HAVING COUNT(*) >= 3
),
user_revenue AS (
    -- 30-day revenue per active user. The date filter lives in the join's
    -- ON clause so users with no recent orders are kept by the LEFT JOIN;
    -- COALESCE turns their NULL SUM into 0.
    SELECT u.user_id, u.login_count, COALESCE(SUM(o.amount), 0) AS revenue_30d
    FROM active_users u
    LEFT JOIN orders o ON o.user_id = u.user_id
        AND o.order_date >= CURRENT_DATE - INTERVAL '30 days'
    GROUP BY u.user_id, u.login_count
)
-- Bucket users into revenue segments and summarize each bucket.
SELECT
    CASE WHEN revenue_30d >= 500 THEN 'whale'
         WHEN revenue_30d >= 100 THEN 'regular'
         WHEN revenue_30d > 0 THEN 'light' ELSE 'free' END AS segment,
    COUNT(*) AS users, ROUND(AVG(revenue_30d), 2) AS avg_revenue
FROM user_revenue GROUP BY 1 ORDER BY avg_revenue DESC;

Recursive CTEs - Hierarchies

-- Org chart: find all reports under a manager
WITH RECURSIVE org_tree AS (
    -- Base case: the manager (employee 42) at depth 0.
    SELECT employee_id, name, manager_id, 0 AS depth
    FROM employees
    WHERE employee_id = 42

    UNION ALL

    -- Recursive case: employees whose manager is already in the tree.
    SELECT e.employee_id, e.name, e.manager_id, t.depth + 1
    FROM employees e
    JOIN org_tree t ON e.manager_id = t.employee_id
    WHERE t.depth < 10  -- safety limit; also stops runaway recursion if the data contains a cycle
)
SELECT * FROM org_tree ORDER BY depth, name;

Advanced Aggregations

GROUPING SETS & ROLLUP

-- Subtotals at every combination of region and product_category.
-- GROUPING(col) = 1 marks a row where that column was rolled up, which
-- distinguishes subtotal rows from genuine NULL values in the data --
-- plain COALESCE would conflate the two.
SELECT
    CASE WHEN GROUPING(region) = 1
         THEN '(All Regions)' ELSE region END AS region,
    CASE WHEN GROUPING(product_category) = 1
         THEN '(All Products)' ELSE product_category END AS category,
    SUM(amount) AS revenue, COUNT(*) AS orders
FROM orders JOIN products USING (product_id)
GROUP BY GROUPING SETS (
    (region, product_category),
    (region), (product_category), ()
)
ORDER BY region NULLS LAST, category NULLS LAST;

-- ROLLUP: hierarchical subtotals (year → quarter, plus a grand-total row)
SELECT
    EXTRACT(YEAR FROM order_date) AS yr,
    EXTRACT(QUARTER FROM order_date) AS qtr,
    SUM(amount) AS revenue
FROM orders
-- NULL yr/qtr in the output marks the rolled-up (subtotal) rows.
GROUP BY ROLLUP (EXTRACT(YEAR FROM order_date), EXTRACT(QUARTER FROM order_date))
ORDER BY yr NULLS LAST, qtr NULLS LAST;

FILTER - Conditional Aggregation

-- Conditional aggregation with FILTER: per-plan counts in one pass.
SELECT
    date_trunc('month', created_at) AS month,
    COUNT(*) AS total_users,
    COUNT(*) FILTER (WHERE plan = 'pro') AS pro_users,
    COUNT(*) FILTER (WHERE plan = 'free') AS free_users,
    ROUND(COUNT(*) FILTER (WHERE plan = 'pro') * 100.0 / COUNT(*), 1) AS pro_pct
FROM users
GROUP BY 1
ORDER BY 1;

Pivoting Data

-- Pivot: rows to columns using conditional aggregation.
-- The date range is half-open with an explicit upper bound: without it,
-- orders from 2026+ would silently merge into the same quarter buckets.
SELECT product_category,
    SUM(amount) FILTER (WHERE EXTRACT(QUARTER FROM order_date) = 1) AS q1,
    SUM(amount) FILTER (WHERE EXTRACT(QUARTER FROM order_date) = 2) AS q2,
    SUM(amount) FILTER (WHERE EXTRACT(QUARTER FROM order_date) = 3) AS q3,
    SUM(amount) FILTER (WHERE EXTRACT(QUARTER FROM order_date) = 4) AS q4,
    SUM(amount) AS total
FROM orders JOIN products USING (product_id)
WHERE order_date >= '2025-01-01'
  AND order_date < '2026-01-01'
GROUP BY 1 ORDER BY total DESC;

-- Dynamic pivot with JSONB: one object per user, keyed by event type.
WITH per_type AS (
    SELECT user_id, event_type, COUNT(*) AS event_count
    FROM events
    GROUP BY user_id, event_type
)
SELECT user_id, jsonb_object_agg(event_type, event_count) AS events
FROM per_type
GROUP BY user_id;

Date/Time Analysis

Date Spine - Fill Missing Days

WITH date_spine AS (
    -- One row per calendar day in 2025, including days with no orders.
    SELECT generate_series('2025-01-01'::date, '2025-12-31'::date, '1 day')::date AS day
),
daily_revenue AS (
    SELECT order_date AS day, SUM(amount) AS revenue
    FROM orders WHERE order_date >= '2025-01-01' GROUP BY 1
)
-- LEFT JOIN keeps zero-revenue days; COALESCE makes them 0 so the
-- cumulative total stays continuous across gaps.
SELECT ds.day, COALESCE(dr.revenue, 0) AS revenue,
    SUM(COALESCE(dr.revenue, 0)) OVER (ORDER BY ds.day) AS cumulative
FROM date_spine ds LEFT JOIN daily_revenue dr ON ds.day = dr.day;

Cohort Retention

-- Monthly cohort retention. Two fixes over the naive version:
--   1. months_since uses YEAR*12 + MONTH: EXTRACT(MONTH FROM AGE(...))
--      alone wraps back to 0 once a full year has passed.
--   2. The denominator is the cohort's size from a dedicated CTE; a
--      per-group FILTER (WHERE active_month = cohort_month) is 0 in every
--      months_since > 0 group and would divide by zero.
WITH user_cohort AS (
    -- Each user's cohort = the month they were first created.
    SELECT user_id, date_trunc('month', MIN(created_at)) AS cohort_month
    FROM users GROUP BY user_id
),
user_activity AS (
    -- One row per (user, month) with any activity.
    SELECT DISTINCT user_id, date_trunc('month', event_date) AS active_month
    FROM events
),
cohort_size AS (
    -- Total users per cohort: the retention denominator.
    SELECT cohort_month, COUNT(*) AS cohort_users
    FROM user_cohort GROUP BY cohort_month
)
SELECT uc.cohort_month,
    (EXTRACT(YEAR FROM AGE(ua.active_month, uc.cohort_month)) * 12
     + EXTRACT(MONTH FROM AGE(ua.active_month, uc.cohort_month)))::int AS months_since,
    COUNT(DISTINCT ua.user_id) AS active_users,
    ROUND(COUNT(DISTINCT ua.user_id) * 100.0 / cs.cohort_users, 1) AS retention_pct
FROM user_cohort uc
JOIN user_activity ua USING (user_id)
JOIN cohort_size cs USING (cohort_month)
GROUP BY 1, 2, cs.cohort_users
ORDER BY 1, 2;

Year-over-Year Comparison

-- YoY revenue by month: LAG partitioned by calendar month and ordered by
-- year pulls the same month from the prior year.
WITH monthly AS (
    SELECT date_trunc('month', order_date) AS month,
           EXTRACT(YEAR FROM order_date) AS yr,
           SUM(amount) AS revenue
    FROM orders GROUP BY 1, 2
),
with_prev AS (
    -- Compute the window once instead of repeating it three times.
    SELECT month, revenue,
           LAG(revenue) OVER (
               PARTITION BY EXTRACT(MONTH FROM month) ORDER BY yr
           ) AS prev_year
    FROM monthly
)
SELECT month, revenue, prev_year,
    ROUND((revenue - prev_year) * 100.0 / NULLIF(prev_year, 0), 1) AS yoy_pct
FROM with_prev ORDER BY month;

Performance: Making Queries Fast

EXPLAIN ANALYZE - Read the Plan

-- ANALYZE actually executes the query (don't use it on destructive DML
-- outside a transaction); BUFFERS adds page-read counts to the plan.
EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT)
SELECT * FROM orders
WHERE user_id = 12345
  AND order_date >= '2025-01-01'
ORDER BY order_date DESC
LIMIT 10;

Key things to look for in the output:

  • Seq Scan on large tables - needs an index
  • Nested Loop with high row estimates - consider Hash Join
  • Sort with high cost - add index matching ORDER BY
  • actual time vs. estimated rows - large mismatches mean stale statistics (run ANALYZE)

Index Strategies

-- Composite index: matches WHERE (user_id equality) + ORDER BY
-- (order_date DESC), so a query like the EXPLAIN example above can read
-- rows already in the requested order.
CREATE INDEX idx_orders_user_date
ON orders (user_id, order_date DESC);

-- Partial index: only index what you query -- smaller, cheaper to
-- maintain, and usable whenever the query's WHERE implies the predicate.
CREATE INDEX idx_orders_active
ON orders (user_id, order_date)
WHERE status = 'active';

-- Covering index: INCLUDE stores extra columns in the index leaf pages so
-- the query can be answered by an index-only scan without heap fetches.
CREATE INDEX idx_orders_covering
ON orders (user_id, order_date DESC)
INCLUDE (amount, status);

Common Anti-Patterns

| Anti-Pattern | Why It's Slow | Fix |
|---|---|---|
| `SELECT *` | Reads all columns, prevents index-only scans | Select only needed columns |
| `WHERE LOWER(email) = ...` | Function on column prevents index use | Create expression index or use `citext` |
| `WHERE date::text LIKE '2025%'` | Cast prevents index use | Use `WHERE date >= '2025-01-01'` |
| `NOT IN (subquery)` | Poor performance with NULLs | Use `NOT EXISTS` instead |
| CTE as optimization fence | PostgreSQL <12 materializes all CTEs | Use subqueries for performance-critical paths |
| `ORDER BY RANDOM()` | Full table scan + sort | Use `TABLESAMPLE` or pre-computed random column |

The 10 Essential Queries

These 10 queries cover ~80% of real-world business analysis. Master them and you can answer almost any question a stakeholder throws at you.

1. Cohort Retention

See the full query in the Date/Time Analysis section above.

2. Conversion Funnel

-- 30-day conversion funnel. Each stage counts distinct users who fired
-- that event; NULLIF guards every ratio so an empty upstream stage yields
-- NULL instead of a division-by-zero error.
WITH funnel AS (
    SELECT
        COUNT(DISTINCT user_id) FILTER (WHERE event = 'signup') AS signups,
        COUNT(DISTINCT user_id) FILTER (WHERE event = 'activation') AS activated,
        COUNT(DISTINCT user_id) FILTER (WHERE event = 'first_purchase') AS purchased,
        COUNT(DISTINCT user_id) FILTER (WHERE event = 'repeat_purchase') AS repeated
    FROM user_events
    WHERE event_date >= CURRENT_DATE - INTERVAL '30 days'
)
SELECT
    signups,
    activated, ROUND(activated * 100.0 / NULLIF(signups, 0), 1) AS activation_rate,
    purchased, ROUND(purchased * 100.0 / NULLIF(activated, 0), 1) AS purchase_rate,
    repeated, ROUND(repeated * 100.0 / NULLIF(purchased, 0), 1) AS repeat_rate
FROM funnel;

3-5. Running Total, Moving Average, YoY

Covered in Window Functions and Date/Time Analysis above.

6. Percentile Calculation

-- Revenue percentiles across users. PERCENTILE_CONT interpolates between
-- adjacent values, so a result may not be an actually observed revenue.
WITH per_user AS (
    SELECT user_id, SUM(amount) AS total_revenue
    FROM orders
    GROUP BY user_id
)
SELECT
    PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY total_revenue) AS median,
    PERCENTILE_CONT(0.90) WITHIN GROUP (ORDER BY total_revenue) AS p90,
    PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY total_revenue) AS p95,
    PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY total_revenue) AS p99
FROM per_user;

7. Sessionization

WITH events_with_gap AS (
    -- Minutes since this user's previous event (NULL for their first event).
    SELECT *,
        EXTRACT(EPOCH FROM event_time - LAG(event_time) OVER (
            PARTITION BY user_id ORDER BY event_time
        )) / 60 AS minutes_since_last
    FROM events
),
sessions AS (
    -- A gap > 30 minutes (or the user's first event, where the gap is
    -- NULL) starts a new session; the running SUM of these "session start"
    -- flags numbers each user's sessions 1, 2, 3, ...
    SELECT *,
        SUM(CASE WHEN minutes_since_last > 30 OR minutes_since_last IS NULL
            THEN 1 ELSE 0 END) OVER (
            PARTITION BY user_id ORDER BY event_time
        ) AS session_id
    FROM events_with_gap
)
-- Collapse each (user, session) pair into start, end, and event count.
SELECT user_id, session_id,
    MIN(event_time) AS session_start,
    MAX(event_time) AS session_end,
    COUNT(*) AS events_in_session
FROM sessions
GROUP BY user_id, session_id;

8. Deduplication

-- Deduplicate: keep only the most recently updated row per email.
WITH ranked AS (
    SELECT *,
        ROW_NUMBER() OVER (
            PARTITION BY email ORDER BY updated_at DESC
        ) AS rn
    FROM users_staging
)
SELECT * FROM ranked WHERE rn = 1;

9. Gap Analysis

WITH gaps AS (
    -- Pair each order_id with the next one in ascending id order.
    SELECT
        order_id,
        LEAD(order_id) OVER (ORDER BY order_id) AS next_id,
        LEAD(order_id) OVER (ORDER BY order_id) - order_id AS gap_size
    FROM orders
)
-- gap_size > 1 means the ids strictly between gap_after and gap_before
-- are missing (gap_size - 1 of them).
SELECT order_id AS gap_after, next_id AS gap_before, gap_size
FROM gaps WHERE gap_size > 1
ORDER BY gap_size DESC;

10. Cumulative Distribution

-- Cumulative distribution / Pareto view of per-user revenue.
-- The running SUM gets an explicit ROWS frame: the default RANGE frame
-- treats tied total_revenue values as one peer group and assigns them all
-- the same cumulative value, which breaks a row-by-row Pareto curve.
SELECT
    user_id,
    total_revenue,
    CUME_DIST() OVER (ORDER BY total_revenue) AS revenue_percentile,
    SUM(total_revenue) OVER (
        ORDER BY total_revenue DESC
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) * 100.0 /
        SUM(total_revenue) OVER () AS cumulative_pct_of_total
FROM (
    SELECT user_id, SUM(amount) AS total_revenue
    FROM orders GROUP BY user_id
) per_user
ORDER BY total_revenue DESC;

The Bottom Line

SQL is the most underrated skill in data analysis. These 10 queries - built on window functions, CTEs, and conditional aggregation - cover the vast majority of business questions. Learn them by pattern, not by memorization. When a stakeholder asks "what's our retention?" you should think cohort + date_trunc + COUNT DISTINCT, not try to remember exact syntax. The syntax you can look up. The patterns are what make you fast.