# football-api/scrapper.py

#!/usr/bin/env python3
"""
FC Porto scraper — zerozero.pt
Scrapes last 2 results and next 2 fixtures including TV channel.
Writes /data/porto.json
"""

import json
import logging
import re
from datetime import datetime
from pathlib import Path
from zoneinfo import ZoneInfo

import httpx
from bs4 import BeautifulSoup

# Configure root logging once at import: INFO level, timestamp + level + message.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)

# JSON file written by main(). The parent directory is created at import time
# so the later write does not fail on a missing directory.
DATA_FILE = Path("/data/porto.json")
DATA_FILE.parent.mkdir(parents=True, exist_ok=True)

# Time zone attached to parsed kick-off times and to the "updated_at" stamp.
PT_TZ = ZoneInfo("Europe/Lisbon")

# Request headers sent by fetch(): a desktop Chrome User-Agent and a
# Portuguese-first Accept-Language.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "pt-PT,pt;q=0.9,en;q=0.8",
}

# Channel names looked for in a table row's text. find_channel() returns the
# first entry, in list order, that occurs in the text, so order matters.
TV_CHANNELS = [
    "Sport TV 1", "Sport TV 2", "Sport TV 3", "Sport TV 4", "Sport TV+",
    "RTP 1", "RTP1", "SIC", "TVI", "Eleven Sports", "DAZN", "Canal+"
]

# Lower-case substrings mapped to the canonical competition name emitted in
# the output. normalise_competition() checks them in insertion order.
COMPETITIONS = {
    "liga portugal": "Liga Portugal",
    "primeira liga": "Liga Portugal",
    "champions league": "Champions League",
    "liga dos campeões": "Champions League",
    "europa league": "Europa League",
    "liga europa": "Europa League",
    "taça de portugal": "Taça de Portugal",
    "taca de portugal": "Taça de Portugal",
    "supertaça": "Supertaça",
}


def fetch(url: str) -> BeautifulSoup | None:
    """Download *url* and return the parsed document, or ``None`` on failure.

    The request is sent with HEADERS, follows redirects and uses a timeout
    of 30. Any exception raised while requesting, checking the HTTP status
    or parsing is logged and swallowed, so callers only need to test the
    result for ``None``.
    """
    try:
        response = httpx.get(url, headers=HEADERS, timeout=30, follow_redirects=True)
        response.raise_for_status()
        document = BeautifulSoup(response.text, "html.parser")
    except Exception as exc:
        log.error("Failed to fetch %s: %s", url, exc)
        return None
    return document


def normalise_competition(raw: str) -> str:
    for key, val in COMPETITIONS.items():
        if key in raw.lower():
            return val
    return raw.strip().title() if raw else "Liga Portugal"


def find_channel(text: str) -> str:
    text_upper = text.upper()
    for ch in TV_CHANNELS:
        if ch.upper() in text_upper:
            return ch
    return "TBD"


def parse_score(text: str) -> str | None:
    m = re.search(r"(\d+)\s*[-–]\s*(\d+)", text)
    return f"{m.group(1)} - {m.group(2)}" if m else None


def parse_date(text: str) -> datetime | None:
    """Build a Lisbon-time ``datetime`` from the first date found in *text*.

    The date must be written day, month, four-digit year, separated by
    ``/`` or ``-``. If a two-digit ``HH:MM`` time also appears in the text
    it is used as the time of day; otherwise the time is 00:00.

    Returns:
        A timezone-aware ``datetime`` in PT_TZ, or ``None`` when no date is
        present or the numbers do not form a valid date/time.
    """
    date_match = re.search(r"(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})", text)
    if date_match is None:
        return None

    time_match = re.search(r"(\d{2}):(\d{2})", text)
    if time_match:
        hour = int(time_match.group(1))
        minute = int(time_match.group(2))
    else:
        hour = minute = 0

    day_txt, month_txt, year_txt = date_match.groups()
    try:
        return datetime(
            int(year_txt), int(month_txt), int(day_txt),
            hour, minute, tzinfo=PT_TZ,
        )
    except ValueError:
        # Out-of-range values such as month 13 or hour 25.
        return None


def scrape_zerozero(
    url: str = "https://www.zerozero.pt/equipa.php?id=8&epoca_id=165",
) -> list[dict]:
    """Scrape FC Porto's match list from zerozero.pt.

    Every ``table tr.odd`` / ``table tr.even`` row with at least three cells,
    a parseable date and at least two team links is turned into a match dict.

    Args:
        url: Team page to scrape. Defaults to the Porto team page
            (jogos tab) that was previously hard-coded.

    Returns:
        Match dicts sorted by ascending ``timestamp``, each with the keys
        ``home``, ``away``, ``opponent``, ``is_home``, ``competition``,
        ``channel``, ``date`` (dd/mm/yy), ``time`` (HH:MM or "TBD"),
        ``timestamp``, ``score`` and ``is_past``. An empty list is returned
        when the page cannot be fetched.
    """
    log.info("Scraping zerozero.pt...")
    soup = fetch(url)
    if soup is None:
        return []

    porto_names = {"fc porto", "porto", "f.c. porto"}
    now = datetime.now(tz=PT_TZ)
    matches = []

    rows = soup.select("table tr.odd, table tr.even")
    log.info("Found %d rows", len(rows))

    for row in rows:
        if len(row.find_all("td")) < 3:
            continue

        full_text = row.get_text(" ", strip=True)

        dt = parse_date(full_text)
        if not dt:
            continue

        # Single pass over the row's links: team links give the two sides,
        # the first labelled competition link gives the competition.
        teams = []
        comp_raw = ""
        for a in row.find_all("a", href=True):
            href = a["href"]
            label = a.get_text(strip=True)
            if "equipa.php" in href and len(label) > 1:
                teams.append(label)
            # Skip text-less links (e.g. logos) so a following labelled
            # competition link is still picked up.
            if not comp_raw and ("competicao" in href or "taça" in label.lower()):
                comp_raw = label

        if len(teams) < 2:
            continue

        home, away = teams[0], teams[1]
        is_home = home.lower() in porto_names
        is_past = dt < now

        matches.append({
            "home": home,
            "away": away,
            "opponent": away if is_home else home,
            "is_home": is_home,
            "competition": normalise_competition(comp_raw),
            "channel": find_channel(full_text),
            "date": dt.strftime("%d/%m/%y"),
            # 00:00 is what parse_date yields when the row carries no time.
            "time": dt.strftime("%H:%M") if dt.hour or dt.minute else "TBD",
            "timestamp": dt.timestamp(),
            # A fixture that has not kicked off cannot have a result; do not
            # let unrelated digits in the row masquerade as one.
            "score": parse_score(full_text) if is_past else None,
            "is_past": is_past,
        })

    matches.sort(key=lambda x: x["timestamp"])
    return matches


def build_output(matches: list[dict]) -> dict:
    now = datetime.now(tz=PT_TZ)
    past = [m for m in matches if m["is_past"] and m.get("score")]
    future = [m for m in matches if not m["is_past"]]

    # last 2 results + next 2 fixtures
    last2 = past[-2:] if len(past) >= 2 else past
    next2 = future[:2] if len(future) >= 2 else future

    # pad if needed
    empty = {
        "home": "—", "away": "—", "opponent": "—", "is_home": True,
        "competition": "—", "channel": "—", "date": "—", "time": "—",
        "timestamp": 0, "score": None, "is_past": False,
    }
    while len(last2) < 2:
        last2.insert(0, empty)
    while len(next2) < 2:
        next2.append(empty)

    return {
        "updated_at": now.isoformat(),
        "team": "FC Porto",
        "display": last2 + next2,   # always 4 items: [past, past, future, future]
        "all_past": past,
        "all_future": future,
    }


def main() -> None:
    """Scrape the fixtures and write them to DATA_FILE as JSON.

    If scraping yields nothing and DATA_FILE already exists, the existing
    file is left untouched. If nothing was scraped and no file exists yet, a
    payload containing only placeholder entries is written.
    """
    matches = scrape_zerozero()

    if not matches:
        if DATA_FILE.exists():
            log.warning("No matches found — keeping existing data")
            return
        log.warning("No matches found and no existing data — writing placeholder file")

    output = build_output(matches)
    # The payload is dumped with ensure_ascii=False and contains non-ASCII
    # text, so pin the encoding instead of relying on the locale default.
    DATA_FILE.write_text(
        json.dumps(output, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    log.info(
        "Wrote %d past + %d future matches",
        len(output["all_past"]), len(output["all_future"])
    )


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()