Database Queries

This guide provides useful SQL queries for inspecting and working with the financial data extractor database.

Accessing the Database

# Connect via psql using Infrastructure Makefile
cd infrastructure
make psql

Using db_manager.py

# List all companies
cd backend
make db-list-companies

# Show database info
make db-info

Migration Queries

Check Applied Migrations

# Using Makefile (from backend/)
cd backend
make migrate-history

# Or via SQL
cd ../infrastructure
make psql
# Then run: SELECT * FROM alembic_version;
SELECT * FROM alembic_version;

Company Queries

View All Companies

# Using Makefile (recommended)
cd backend
make db-list-companies

Or via SQL:

SELECT
    id,
    name,
    primary_ticker,
    tickers,
    ir_url,
    created_at
FROM companies
ORDER BY name;

View All Companies with Ticker Details

# Using Makefile (recommended)
cd backend
make db-list-companies

Or via SQL:

SELECT
    c.id,
    c.name,
    c.primary_ticker,
    jsonb_array_elements(c.tickers) AS ticker_details,
    c.ir_url
FROM companies c
ORDER BY c.name;

Query Companies by Ticker (Search All Exchanges)

Find company by any ticker symbol across all exchanges:

-- Option 1: Using primary_ticker or JSONB contains
SELECT
    c.id,
    c.name,
    c.primary_ticker,
    c.tickers,
    c.ir_url
FROM companies c
WHERE c.primary_ticker = 'AZN'
   OR c.tickers @> '[{"ticker": "AZN"}]'::jsonb
   OR EXISTS (
       SELECT 1
       FROM jsonb_array_elements(c.tickers) AS ticker_obj
       WHERE ticker_obj->>'ticker' = 'AZN'
   );
-- Option 2: Using jsonb_array_elements
SELECT
    c.id,
    c.name,
    c.primary_ticker,
    c.tickers,
    ticker_elem->>'ticker' as matching_ticker,
    ticker_elem->>'exchange' as exchange,
    c.ir_url
FROM companies c,
     jsonb_array_elements(c.tickers) AS ticker_elem
WHERE ticker_elem->>'ticker' = 'AZN';

View All Tickers for a Company

SELECT
    c.name,
    c.primary_ticker,
    jsonb_array_elements(c.tickers) AS ticker_info
FROM companies c
WHERE c.name = 'Unilever PLC';

Document Queries

View All Documents

SELECT
    d.id,
    c.name as company_name,
    d.url,
    d.fiscal_year,
    d.document_type,
    d.file_path,
    d.created_at
FROM documents d
JOIN companies c ON c.id = d.company_id
ORDER BY c.name, d.fiscal_year DESC;

Documents by Company

SELECT
    c.name,
    c.primary_ticker,
    d.fiscal_year,
    d.document_type,
    d.url,
    d.file_path
FROM companies c
JOIN documents d ON d.company_id = c.id
WHERE c.primary_ticker = 'AZN'
ORDER BY d.fiscal_year DESC;

Extraction Queries

View Extractions for a Company

SELECT
    c.name,
    c.primary_ticker,
    d.fiscal_year,
    d.document_type,
    e.statement_type,
    e.raw_data->>'currency' as currency,
    e.created_at as extraction_date
FROM companies c
JOIN documents d ON d.company_id = c.id
JOIN extractions e ON e.document_id = d.id
WHERE c.primary_ticker = 'AZN'
ORDER BY d.fiscal_year DESC, e.statement_type;

Count Extractions per Company

SELECT
    c.name,
    c.primary_ticker,
    COUNT(e.id) as total_extractions,
    COUNT(DISTINCT d.fiscal_year) as years_with_data
FROM companies c
LEFT JOIN documents d ON d.company_id = c.id
LEFT JOIN extractions e ON e.document_id = d.id
GROUP BY c.id, c.name, c.primary_ticker
ORDER BY total_extractions DESC;

View Extraction Details

SELECT
    e.id,
    c.name as company_name,
    d.fiscal_year,
    e.statement_type,
    jsonb_array_length(e.raw_data->'line_items') as num_line_items,
    e.raw_data->>'currency' as currency,
    e.created_at
FROM extractions e
JOIN documents d ON d.id = e.document_id
JOIN companies c ON c.id = d.company_id
ORDER BY e.created_at DESC
LIMIT 10;

Compiled Statements Queries

View Compiled Statement

SELECT
    c.name,
    c.primary_ticker,
    cs.statement_type,
    jsonb_array_length(cs.data->'line_items') as num_line_items,
    cs.data->>'currency' as currency,
    cs.updated_at
FROM companies c
JOIN compiled_statements cs ON cs.company_id = c.id
WHERE c.primary_ticker = 'AZN'
  AND cs.statement_type = 'income_statement';

View All Compiled Statements Summary

SELECT
    c.name,
    c.primary_ticker,
    cs.statement_type,
    jsonb_array_length(cs.data->'line_items') as num_line_items,
    cs.updated_at,
    CASE
        WHEN cs.updated_at > NOW() - INTERVAL '7 days' THEN 'Recent'
        ELSE 'Stale'
    END as freshness
FROM companies c
JOIN compiled_statements cs ON cs.company_id = c.id
ORDER BY c.name, cs.statement_type;

Get Line Items from Compiled Statement

SELECT
    c.name,
    cs.statement_type,
    jsonb_array_elements(cs.data->'line_items') AS line_item
FROM companies c
JOIN compiled_statements cs ON cs.company_id = c.id
WHERE c.primary_ticker = 'AZN'
  AND cs.statement_type = 'income_statement'
LIMIT 5;

Statistics Queries

Database Statistics

Get overview of database contents:

SELECT
    'Companies' as table_name,
    COUNT(*) as count
FROM companies
UNION ALL
SELECT
    'Documents',
    COUNT(*)
FROM documents
UNION ALL
SELECT
    'Extractions',
    COUNT(*)
FROM extractions
UNION ALL
SELECT
    'Compiled Statements',
    COUNT(*)
FROM compiled_statements;

Companies with Most Documents

SELECT
    c.name,
    c.primary_ticker,
    COUNT(DISTINCT d.id) as document_count,
    MIN(d.fiscal_year) as earliest_year,
    MAX(d.fiscal_year) as latest_year,
    COUNT(DISTINCT e.id) as extraction_count
FROM companies c
LEFT JOIN documents d ON d.company_id = c.id
LEFT JOIN extractions e ON e.document_id = d.id
GROUP BY c.id, c.name, c.primary_ticker
ORDER BY document_count DESC;

Statement Types Distribution

SELECT
    statement_type,
    COUNT(*) as count,
    COUNT(DISTINCT document_id) as document_count,
    COUNT(DISTINCT (
        SELECT company_id
        FROM documents
        WHERE id = extractions.document_id
    )) as company_count
FROM extractions
GROUP BY statement_type
ORDER BY count DESC;

JSONB Queries

Query JSONB Tickers

-- Find companies with ticker on specific exchange
SELECT
    c.name,
    c.primary_ticker,
    ticker_elem->>'ticker' as ticker,
    ticker_elem->>'exchange' as exchange
FROM companies c,
     jsonb_array_elements(c.tickers) AS ticker_elem
WHERE ticker_elem->>'exchange' = 'LSE';

Extract Values from Extraction JSONB

-- Get revenue values from income statement extractions
SELECT
    c.name,
    d.fiscal_year,
    line_item->>'name' as line_item_name,
    line_item->'values'->>'2024' as value_2024
FROM companies c
JOIN documents d ON d.company_id = c.id
JOIN extractions e ON e.document_id = d.id,
     jsonb_array_elements(e.raw_data->'line_items') AS line_item
WHERE e.statement_type = 'income_statement'
  AND line_item->>'name' ILIKE '%revenue%'
ORDER BY c.name, d.fiscal_year DESC;

Query Compiled Statement Data

-- Get all line items with their values
SELECT
    c.name,
    cs.statement_type,
    line_item->>'name' as line_item_name,
    line_item->'values' as year_values,
    line_item->>'is_total' as is_total
FROM companies c
JOIN compiled_statements cs ON cs.company_id = c.id,
     jsonb_array_elements(cs.data->'line_items') AS line_item
WHERE c.primary_ticker = 'AZN'
  AND cs.statement_type = 'income_statement'
ORDER BY (line_item->>'indentation_level')::int, line_item->>'name';

Performance Queries

Check Index Usage

SELECT
    schemaname,
    tablename,
    indexname,
    idx_scan as index_scans,
    idx_tup_read as tuples_read,
    idx_tup_fetch as tuples_fetched
FROM pg_stat_user_indexes
WHERE schemaname = 'public'
ORDER BY idx_scan DESC;

Table Sizes

SELECT
    schemaname,
    tablename,
    pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS size,
    pg_total_relation_size(schemaname||'.'||tablename) AS size_bytes
FROM pg_tables
WHERE schemaname = 'public'
ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC;