SQL é uma habilidade essencial para qualquer data scientist. Vamos explorar queries que você usará diariamente.
A query mais fundamental em SQL:
SELECT customer_id, SUM(amount) as total_spent
FROM orders
WHERE order_date >= '2026-01-01'
GROUP BY customer_id
HAVING SUM(amount) > 1000
ORDER BY total_spent DESC
LIMIT 10;
SELECT customer_id, SUM(amount) as total_spent
FROM orders
WHERE order_date >= '2026-01-01'
GROUP BY customer_id
HAVING SUM(amount) > 1000
ORDER BY total_spent DESC
LIMIT 10;
Componentes:
Retorna apenas registros que existem em ambas as tabelas.
SELECT
o.order_id,
c.customer_name,
o.amount
FROM orders o
INNER JOIN customers c ON o.customer_id = c.customer_id;
SELECT
o.order_id,
c.customer_name,
o.amount
FROM orders o
INNER JOIN customers c ON o.customer_id = c.customer_id;
Retorna todos os registros da tabela esquerda + matches da direita.
SELECT
c.customer_id,
c.customer_name,
COUNT(o.order_id) as num_orders
FROM customers c
LEFT JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id;
SELECT
c.customer_id,
c.customer_name,
COUNT(o.order_id) as num_orders
FROM customers c
LEFT JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id;
Retorna todos os registros de ambas as tabelas.
SELECT
COALESCE(c.customer_id, o.customer_id) as customer_id,
c.customer_name,
o.order_id
FROM customers c
FULL OUTER JOIN orders o ON c.customer_id = o.customer_id;
SELECT
COALESCE(c.customer_id, o.customer_id) as customer_id,
c.customer_name,
o.order_id
FROM customers c
FULL OUTER JOIN orders o ON c.customer_id = o.customer_id;
Funções que operam em um "janela" de linhas:
SELECT
customer_id,
order_date,
amount,
SUM(amount) OVER (
PARTITION BY customer_id
ORDER BY order_date
) as running_total,
ROW_NUMBER() OVER (
PARTITION BY customer_id
ORDER BY amount DESC
) as rank_by_amount
FROM orders;
SELECT
customer_id,
order_date,
amount,
SUM(amount) OVER (
PARTITION BY customer_id
ORDER BY order_date
) as running_total,
ROW_NUMBER() OVER (
PARTITION BY customer_id
ORDER BY amount DESC
) as rank_by_amount
FROM orders;
Queries nomeadas que podem ser referenciadas:
WITH customer_totals AS (
SELECT
customer_id,
SUM(amount) as total,
COUNT(*) as num_orders
FROM orders
GROUP BY customer_id
),
high_value_customers AS (
SELECT *
FROM customer_totals
WHERE total > 5000
)
SELECT * FROM high_value_customers
ORDER BY total DESC;
WITH customer_totals AS (
SELECT
customer_id,
SUM(amount) as total,
COUNT(*) as num_orders
FROM orders
GROUP BY customer_id
),
high_value_customers AS (
SELECT *
FROM customer_totals
WHERE total > 5000
)
SELECT * FROM high_value_customers
ORDER BY total DESC;
CTEs tornam queries complexas mais legíveis.
Índices aceleram buscas significativamente:
CREATE INDEX idx_customer_id ON orders(customer_id);
CREATE INDEX idx_order_date ON orders(order_date);
CREATE INDEX idx_customer_id ON orders(customer_id);
CREATE INDEX idx_order_date ON orders(order_date);
Selecione apenas colunas necessárias:
-- Ruim
SELECT * FROM orders;
-- Bom
SELECT order_id, customer_id, amount FROM orders;
-- Ruim
SELECT * FROM orders;
-- Bom
SELECT order_id, customer_id, amount FROM orders;
Entenda planos de execução:
EXPLAIN SELECT * FROM orders WHERE customer_id = 123;
EXPLAIN SELECT * FROM orders WHERE customer_id = 123;
Para tabelas com bilhões de linhas, particione por data:
CREATE TABLE orders_2026_01 PARTITION OF orders
FOR VALUES FROM ('2026-01-01') TO ('2026-02-01');
CREATE TABLE orders_2026_01 PARTITION OF orders
FOR VALUES FROM ('2026-01-01') TO ('2026-02-01');
SELECT
c.customer_id,
c.customer_name,
MAX(o.order_date) as last_order_date,
CURRENT_DATE - MAX(o.order_date) as days_inactive
FROM customers c
LEFT JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id
HAVING MAX(o.order_date) < CURRENT_DATE - INTERVAL '90 days'
ORDER BY days_inactive DESC;
SELECT
c.customer_id,
c.customer_name,
MAX(o.order_date) as last_order_date,
CURRENT_DATE - MAX(o.order_date) as days_inactive
FROM customers c
LEFT JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id
HAVING MAX(o.order_date) < CURRENT_DATE - INTERVAL '90 days'
ORDER BY days_inactive DESC;
WITH cohorts AS (
SELECT
customer_id,
DATE_TRUNC('month', first_order_date) as cohort_month,
DATE_TRUNC('month', order_date) as order_month
FROM (
SELECT
customer_id,
order_date,
MIN(order_date) OVER (PARTITION BY customer_id) as first_order_date
FROM orders
)
)
SELECT
cohort_month,
EXTRACT(MONTH FROM order_month - cohort_month) as months_since_first,
COUNT(DISTINCT customer_id) as num_customers
FROM cohorts
GROUP BY cohort_month, months_since_first
ORDER BY cohort_month, months_since_first;
WITH cohorts AS (
SELECT
customer_id,
DATE_TRUNC('month', first_order_date) as cohort_month,
DATE_TRUNC('month', order_date) as order_month
FROM (
SELECT
customer_id,
order_date,
MIN(order_date) OVER (PARTITION BY customer_id) as first_order_date
FROM orders
)
)
SELECT
cohort_month,
EXTRACT(MONTH FROM order_month - cohort_month) as months_since_first,
COUNT(DISTINCT customer_id) as num_customers
FROM cohorts
GROUP BY cohort_month, months_since_first
ORDER BY cohort_month, months_since_first;
WITH daily_stats AS (
SELECT
DATE(order_date) as order_day,
COUNT(*) as num_orders,
SUM(amount) as total_amount,
AVG(amount) as avg_amount
FROM orders
GROUP BY DATE(order_date)
),
stats_with_mean_std AS (
SELECT
*,
AVG(num_orders) OVER () as mean_orders,
STDDEV(num_orders) OVER () as std_orders
FROM daily_stats
)
SELECT
order_day,
num_orders,
CASE
WHEN ABS(num_orders - mean_orders) > 2 * std_orders THEN 'Anomaly'
ELSE 'Normal'
END as status
FROM stats_with_mean_std
ORDER BY order_day DESC;
WITH daily_stats AS (
SELECT
DATE(order_date) as order_day,
COUNT(*) as num_orders,
SUM(amount) as total_amount,
AVG(amount) as avg_amount
FROM orders
GROUP BY DATE(order_date)
),
stats_with_mean_std AS (
SELECT
*,
AVG(num_orders) OVER () as mean_orders,
STDDEV(num_orders) OVER () as std_orders
FROM daily_stats
)
SELECT
order_day,
num_orders,
CASE
WHEN ABS(num_orders - mean_orders) > 2 * std_orders THEN 'Anomaly'
ELSE 'Normal'
END as status
FROM stats_with_mean_std
ORDER BY order_day DESC;
SQL é poderoso — aprenda bem e você será muito mais produtivo. Pratique regularmente com dados reais e você dominará em pouco tempo!
Próximos passos: Explore NoSQL (MongoDB), data warehouses (BigQuery, Snowflake) e ferramentas de BI (Tableau, Power BI).