"""Fetch CFTC COT (Commitment of Traders) data.

CFTC publishes weekly COT reports on cftc.gov. We use the Socrata Open Data
API (free, no key) for the legacy COT report:
  COT_ENDPOINT = "https://publicreporting.cftc.gov/resource/6dca-aqww.json"

The "current" report is published every Friday at 3:30 PM ET for the
Tuesday positions. We fetch the most recent report and parse it for our
universe.

This is a stub for now — we only need to know the report is available;
deep parsing comes in the data pipeline work (TODO §E).
"""
from __future__ import annotations

import json
import sys
import time
import urllib.parse
import urllib.request
from datetime import date, datetime, timedelta
from pathlib import Path

# Root of the vault: parents[0] is the folder holding this file, so
# parents[2] is the directory two levels above that folder.
VAULT = Path(__file__).resolve().parents[2]
# Location for COT data inside the vault. Nothing in the visible part of this
# module reads or writes it yet (presumably reserved for cached reports —
# TODO confirm).
COT_DIR = VAULT / "05-data" / "cot"

# Socrata endpoint for the legacy COT report (all commodities combined).
# Query parameters ($where/$order/$limit) are appended by fetch_latest_cot.
COT_ENDPOINT = "https://publicreporting.cftc.gov/resource/6dca-aqww.json"

# Map our universe to CFTC market names (these are the futures contracts
# the COT report covers, not the ETFs).
# Reference: https://www.cftc.gov/MarketReports/CommitmentsofTraders/HistoricalViewable/index.htm
#
# Keys are our internal instrument ids; values are the market-name text used
# to filter `market_and_exchange_names`. Note that fetch_universe_cot only
# sends the first word of each value (text before the first comma, then before
# the first space), so e.g. the "cl" entry is queried as "CRUDE".
COT_MARKETS = {
    # id -> CFTC market name filter
    "gc": "GOLD",
    "si": "SILVER",
    "hg": "COPPER",
    "cl": "CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE EXCHANGE",
    "ng": "NATURAL GAS",
    "zc": "CORN",
    "zw": "WHEAT",
    "zs": "SOYBEANS",
    "es": "E-MINI S&P 500",
    "nq": "NASDAQ-100 (MINI)",
    "rty": "RUSSELL 2000 (MINI)",
    "6e": "EURO FX",
    "6j": "JAPANESE YEN",
    "6b": "BRITISH POUND",
    "6a": "AUSTRALIAN DOLLAR",
    "6c": "CANADIAN DOLLAR",
    "6s": "SWISS FRANC",
}


def fetch_latest_cot(market_substr: str, timeout: int = 15) -> dict | None:
    """Fetch the most recent COT row whose market name contains `market_substr`.

    Queries the Socrata dataset behind ``COT_ENDPOINT`` with a SoQL ``like``
    filter on ``market_and_exchange_names``, ordered by
    ``report_date_as_yyyy_mm_dd`` descending (not ``report_date``) and
    limited to a single row.

    Args:
        market_substr: Text to look for inside the market name. It is placed
            inside a single-quoted SoQL string literal, so embedded single
            quotes are escaped by doubling. LIKE wildcard characters in the
            text are passed through unescaped.
        timeout: Timeout in seconds handed to ``urllib.request.urlopen``.

    Returns:
        The first row of the JSON response, or ``None`` when the response is
        an empty list, is not a list at all, or the request/decoding fails.
        Failures are reported on stderr and never raised (best-effort fetch).
    """
    # SoQL string literals are single-quoted; a quote inside the literal must
    # be written as two quotes, otherwise it terminates the literal early and
    # the rest of the text is parsed as query syntax.
    literal = market_substr.replace("'", "''")
    # The LIKE wildcard % is written raw here; urlencode turns it into %25
    # (and percent-encodes the quotes and spaces) when building the URL.
    where = f"market_and_exchange_names like '%{literal}%'"
    params = {
        "$where": where,
        "$order": "report_date_as_yyyy_mm_dd DESC",
        "$limit": "1",
    }
    url = f"{COT_ENDPOINT}?{urllib.parse.urlencode(params)}"
    try:
        with urllib.request.urlopen(url, timeout=timeout) as r:
            data = json.loads(r.read().decode("utf-8"))
    except Exception as e:
        # Deliberately broad: network, HTTP, decoding and JSON errors are all
        # treated the same way — log and report "no data" to the caller.
        print(f"  COT fetch error for {market_substr!r}: {e}", file=sys.stderr)
        return None
    if not isinstance(data, list):
        # A successful query yields a JSON array of rows; anything else is
        # reported instead of being indexed blindly.
        print(
            f"  COT fetch error for {market_substr!r}: "
            f"unexpected response type {type(data).__name__}",
            file=sys.stderr,
        )
        return None
    return data[0] if data else None


def fetch_universe_cot(instrument_ids: list[str] | None = None,
                       cache: bool = True) -> dict[str, dict | None]:
    """Look up the latest COT row for each requested instrument.

    Args:
        instrument_ids: Ids to fetch (keys of ``COT_MARKETS``). ``None``
            means every id in ``COT_MARKETS``.
        cache: Accepted for interface compatibility; not used by this
            function at present.

    Returns:
        A dict keyed by instrument id. The value is whatever
        ``fetch_latest_cot`` returned for that instrument, or ``None`` when
        the id has no entry in ``COT_MARKETS``.
    """
    wanted = list(COT_MARKETS.keys()) if instrument_ids is None else instrument_ids
    results: dict[str, dict | None] = {}
    for instrument in wanted:
        market_name = COT_MARKETS.get(instrument)
        if market_name:
            # Only the first word of the configured name is sent as the
            # search text: everything before the first comma, then before
            # the first space, upper-cased.
            # NOTE(review): a one-word token such as "EURO" may match more
            # than one market name — confirm the newest matching row is the
            # intended contract.
            before_comma = market_name.partition(",")[0]
            token = before_comma.partition(" ")[0].upper()
            results[instrument] = fetch_latest_cot(token)
        else:
            # Unknown id (or empty filter): record it without querying.
            results[instrument] = None
    return results


def cot_summary(cot_row: dict | None) -> str:
    """Format a COT row as a single human-readable line.

    Args:
        cot_row: A row as returned by the COT fetch, or ``None``.

    Returns:
        ``"no data"`` for a missing or empty row; otherwise a line with the
        report date, the non-commercial net position (long minus short) and
        the open interest. Missing fields show as ``?``; the net shows as
        ``None`` when either side cannot be converted to an integer.
    """
    if not cot_row:
        return "no data"

    longs = cot_row.get("noncomm_positions_long_all", "?")
    shorts = cot_row.get("noncomm_positions_short_all", "?")
    try:
        net = int(longs) - int(shorts)
    except (TypeError, ValueError):
        # A "?" placeholder or any other non-integer value lands here.
        net = None

    report_date = cot_row.get("report_date_as_yyyy_mm_dd", "?")
    open_interest = cot_row.get("open_interest_all", "?")
    return f"date={report_date}  noncomm net={net}  OI={open_interest}"


if __name__ == "__main__":
    # Smoke run: fetch a handful of headline instruments, report how long the
    # round trips took, then print one summary line per instrument.
    headline_ids = ["gc", "cl", "es", "6e", "6j", "ng", "zw"]
    print("Fetching latest COT for headline instruments...")
    started = time.time()
    latest = fetch_universe_cot(headline_ids)
    count = len(latest)
    elapsed = time.time() - started
    print(f"  {count} instruments in {elapsed:.1f}s")
    for instrument, cot_row in latest.items():
        summary = cot_summary(cot_row)
        print(f"  {instrument:6s}  {summary}")
