# football-api/scraper/scraper.py

#!/usr/bin/env python3
"""
FC Porto fixture scraper — ESPN
Server-side rendered, no JS needed, no API key required.
Writes /data/porto.json
"""

import json
import logging
import os
import re
from datetime import datetime
from pathlib import Path
from zoneinfo import ZoneInfo

import httpx
from bs4 import BeautifulSoup

# Root logger configuration: INFO level, timestamped one-line records.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)

# Output file for the scraped data. NOTE: the parent directory is created as a
# side effect of importing this module, not only when the script is run.
DATA_FILE = Path("/data/porto.json")
DATA_FILE.parent.mkdir(parents=True, exist_ok=True)

# Team selection, overridable through environment variables. TEAM_ID and
# TEAM_SLUG are interpolated into the ESPN URLs; TEAM_NAME is used for logging
# and the "team" field of the output.
TEAM_ID   = os.environ.get("ESPN_TEAM_ID", "437")
TEAM_SLUG = os.environ.get("ESPN_TEAM_SLUG", "por.porto")
TEAM_NAME = os.environ.get("TEAM_NAME", "FC Porto")
# Time zone attached to parsed match dates and to the "updated_at" stamp.
PT_TZ     = ZoneInfo("Europe/Lisbon")

# Competition name as shown in the scraped table -> short code stored in the
# "abbr" field. Both the accented and unaccented spellings of the cup are listed.
COMP_MAP = {
    "Portuguese Primeira Liga": "LP",
    "UEFA Europa League":       "UE",
    "UEFA Champions League":    "CL",
    "Taca de Portugal":         "TP",
    "Taça de Portugal":         "TP",
    "FIFA Club World Cup":      "CW",
    "UEFA Super Cup":           "SC",
}

# Request headers sent with every page fetch (desktop-browser-like User-Agent).
# NOTE(review): the English Accept-Language presumably keeps day/month names
# parseable by the English strptime formats used for dates — confirm.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                  "Chrome/124.0 Safari/537.36",
    "Accept-Language": "en-GB,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}


# Patterns and constants used by the row parser; compiled once instead of per row.
_TEAM_HREF_RE = re.compile(r"/soccer/team/_/id/\d+")
# No trailing slash required, so ".../id/437" and ".../id/437/porto" both match.
_TEAM_ID_RE = re.compile(r"/id/(\d+)")
_SCORE_RE = re.compile(r"^(\d{1,2})\s*-\s*(\d{1,2})")
_STATUS_LABELS = ("FT", "AET", "Pen")
_DATE_FORMATS = ("%a, %b %d %Y", "%A, %B %d %Y")
_LOGO_URL = "https://a.espncdn.com/i/teamlogos/soccer/500/{team_id}.png"


def _team_from_cell(cell) -> "tuple | None":
    """Return ``(name, team_id, logo_url)`` for the team link in *cell*.

    Returns ``None`` when the cell holds no ESPN team link. ``team_id`` and
    ``logo_url`` are ``None`` if no numeric id can be read from the href.
    """
    link = cell.find("a", href=_TEAM_HREF_RE)
    if not link:
        return None
    id_match = _TEAM_ID_RE.search(link["href"])
    team_id = id_match.group(1) if id_match else None
    logo = _LOGO_URL.format(team_id=team_id) if team_id else None
    return link.get_text(strip=True), team_id, logo


def _competition_from_cells(cells) -> str:
    """Return the competition name of a row, or "—" when it has none.

    The competition sits in cell[5] when present; otherwise cell[4] is used
    unless it only carries a result status (FT / AET / Pen).
    """
    if len(cells) > 5:
        return cells[5].get_text(strip=True)
    text = cells[4].get_text(strip=True)
    return "—" if text in _STATUS_LABELS else text


def _resolve_match_date(date_text: str, is_past: bool, now: datetime) -> "datetime | None":
    """Turn a year-less date such as "Sat, Jan 3" into an aware datetime.

    The page omits the year, so the current year is tried first. If that would
    put a result in the future, or a fixture before today, the adjacent year
    (previous for results, next for fixtures) is used instead. A year in which
    the date does not exist (29 February) is skipped rather than aborting.
    Dates are compared by calendar day so today's unplayed games stay as-is.

    Returns ``None`` when the text matches no known format or no candidate
    year yields a valid date.
    """
    adjacent = now.year - 1 if is_past else now.year + 1
    today = now.date()
    for fmt in _DATE_FORMATS:
        for year in (now.year, adjacent):
            try:
                parsed = datetime.strptime(f"{date_text} {year}", fmt)
            except ValueError:
                # Wrong format, or the date does not exist in this year.
                continue
            parsed = parsed.replace(tzinfo=PT_TZ)
            if year == now.year:
                wrong_side = parsed.date() > today if is_past else parsed.date() < today
                if wrong_side:
                    continue  # fall through to the adjacent year
            return parsed
    return None


def _parse_row(cells, is_past: bool, now: datetime) -> "dict | None":
    """Convert one table row (its ``<td>`` cells) into a match dict.

    Expected layout:
      cell[0] = date
      cell[1] = home team (link)
      cell[2] = score (results) or kick-off time (fixtures)
      cell[3] = away team (link)
      cell[4] = result status, or competition on some fixture rows
      cell[5] = competition

    Returns ``None`` for rows that are not matches: header rows, rows without
    team links, and result rows without a parseable score.
    """
    date_text = cells[0].get_text(strip=True)
    if not date_text or date_text == "DATE":
        return None

    home_team = _team_from_cell(cells[1])
    away_team = _team_from_cell(cells[3])
    if home_team is None or away_team is None:
        return None
    home, home_id, home_logo = home_team
    away, away_id, away_logo = away_team

    # Drop gameNote spans (e.g. "1st Leg", "2nd Leg", aggregate text) so only
    # the score or time remains in the middle cell.
    for note in cells[2].find_all("span", class_="gameNote"):
        note.decompose()
    mid_text = cells[2].get_text(strip=True)

    score_str = None
    time_str = "TBD"
    if is_past:
        score_match = _SCORE_RE.match(mid_text)
        if not score_match:
            return None
        score_str = f"{score_match.group(1)}-{score_match.group(2)}"
    else:
        time_str = mid_text or "TBD"

    comp_raw = _competition_from_cells(cells)
    fallback_abbr = comp_raw[:2].upper() if comp_raw not in ("—", "") else "—"
    abbr = COMP_MAP.get(comp_raw, fallback_abbr)

    # Decide which side is "our" team. The ESPN team id is authoritative and
    # honours the ESPN_TEAM_ID override; the displayed name is only a fallback
    # for when the configured id matches neither side.
    if TEAM_ID in (home_id, away_id):
        is_home = home_id == TEAM_ID
    else:
        is_home = home.lower() in {"fc porto", "porto", TEAM_NAME.lower()}

    dt = _resolve_match_date(date_text, is_past, now)

    return {
        "home":          home,
        "away":          away,
        "home_logo":     home_logo,
        "away_logo":     away_logo,
        "opponent":      away if is_home else home,
        "opponent_logo": away_logo if is_home else home_logo,
        "is_home":       is_home,
        "competition":   comp_raw,
        "abbr":          abbr,
        "date":          dt.strftime("%d/%m/%y") if dt else date_text,
        "time":          time_str,
        "timestamp":     dt.timestamp() if dt else 0,
        "score":         score_str,
        "is_past":       is_past,
    }


def scrape() -> list[dict]:
    """Fetch the team's ESPN fixtures and results pages and parse every match.

    Each page is fetched independently; a page that fails to download or parse
    is logged and skipped, so the result may contain only fixtures, only
    results, or nothing. Rows that raise while being parsed are skipped.

    Returns:
        A list of match dicts sorted by ascending ``timestamp``, each with the
        keys ``home``, ``away``, ``home_logo``, ``away_logo``, ``opponent``,
        ``opponent_logo``, ``is_home``, ``competition``, ``abbr``, ``date``
        (``dd/mm/yy``, or the raw text if unparseable), ``time``,
        ``timestamp`` (0 if the date is unparseable), ``score`` (``None`` for
        fixtures) and ``is_past``.
    """
    log.info("Scraping ESPN for %s (id=%s)...", TEAM_NAME, TEAM_ID)

    base  = "https://www.espn.com/soccer/team"
    pages = {
        "future": f"{base}/fixtures/_/id/{TEAM_ID}/{TEAM_SLUG}",
        "past":   f"{base}/results/_/id/{TEAM_ID}/{TEAM_SLUG}",
    }

    matches = []
    # One reference instant for the whole run keeps year inference consistent
    # across rows even if the scrape straddles midnight.
    now = datetime.now(PT_TZ)

    for mode, url in pages.items():
        is_past = mode == "past"

        try:
            r = httpx.get(url, headers=HEADERS, timeout=30, follow_redirects=True)
            r.raise_for_status()
            soup = BeautifulSoup(r.text, "lxml")
        except Exception as e:
            # Best effort: a failure on one page must not discard the other.
            log.error("Failed to fetch %s: %s", url, e)
            continue

        count = 0
        for table in soup.find_all("table"):
            for row in table.find_all("tr"):
                cells = row.find_all("td")
                if len(cells) < 5:
                    continue
                try:
                    match = _parse_row(cells, is_past, now)
                except Exception as e:
                    # Unexpected markup in a single row should not abort the page.
                    log.debug("Skipping row: %s", e)
                    continue
                if match is None:
                    continue
                matches.append(match)
                count += 1

        log.info("  %s: %d matches parsed", mode, count)

    matches.sort(key=lambda x: x["timestamp"])
    log.info("Total: %d past + %d future",
             sum(1 for m in matches if m["is_past"]),
             sum(1 for m in matches if not m["is_past"]))
    return matches


def build_output(matches: list[dict]) -> dict:
    """Assemble the JSON payload from the parsed matches.

    Args:
        matches: Match dicts carrying at least an ``is_past`` flag. They are
            expected in ascending timestamp order (as ``scrape()`` returns
            them), because the "last two" and "next two" are picked by position.

    Returns:
        A dict with:
          ``updated_at`` – ISO timestamp of generation (Europe/Lisbon),
          ``team``       – configured team name,
          ``display``    – exactly four entries: the last two results followed
                           by the next two fixtures, padded with placeholder
                           entries ("—") when fewer are available,
          ``all_past`` / ``all_future`` – the real matches only.
    """
    past   = [m for m in matches if m["is_past"]]
    future = [m for m in matches if not m["is_past"]]

    empty = {
        "home": "—", "away": "—", "home_logo": None, "away_logo": None,
        "opponent": "—", "opponent_logo": None, "is_home": True,
        "competition": "—", "abbr": "—",
        "date": "—", "time": "—", "timestamp": 0,
        "score": None, "is_past": False,
    }

    # Slices are always new lists (and cope with short inputs), and the padded
    # lists are built fresh, so placeholders can never leak into `past` or
    # `future` and end up in all_past / all_future.
    recent   = past[-2:]
    upcoming = future[:2]
    last2 = [dict(empty) for _ in range(2 - len(recent))] + recent
    next2 = upcoming + [dict(empty) for _ in range(2 - len(upcoming))]

    return {
        "updated_at": datetime.now(tz=PT_TZ).isoformat(),
        "team":       TEAM_NAME,
        "display":    last2 + next2,
        "all_past":   past,
        "all_future": future,
    }


def main():
    """Scrape once and write the result to DATA_FILE.

    If the scrape yields nothing and a data file already exists, the existing
    file is kept untouched. If no file exists yet, a payload containing only
    placeholder entries is written so that consumers always find a file.
    """
    matches = scrape()

    if not matches:
        log.warning("No matches found — keeping existing data if present")
        if DATA_FILE.exists():
            return

    output = build_output(matches)
    payload = json.dumps(output, indent=2, ensure_ascii=False)

    # ensure_ascii=False emits raw non-ASCII characters ("—", "Taça"), so the
    # encoding is pinned instead of depending on the process locale. Writing to
    # a sibling temp file and renaming it into place means readers never see a
    # half-written JSON document.
    tmp_file = DATA_FILE.with_name(DATA_FILE.name + ".tmp")
    tmp_file.write_text(payload, encoding="utf-8")
    tmp_file.replace(DATA_FILE)
    log.info("Wrote %d total matches to %s", len(matches), DATA_FILE)


# Script entry point: run one scrape-and-write cycle when executed directly.
if __name__ == "__main__":
    main()