Update scraper/scraper.py

2026-04-12 13:17:15 +00:00
parent e4906c0e2a
commit 9428088c13
1 changed files with 134 additions and 142 deletions
--- a/scraper/scraper.py
+++ b/scraper/scraper.py
@@ -1,12 +1,13 @@
 #!/usr/bin/env python3
 """
-FC Porto scraper — zerozero.pt
-Scrapes last 2 results and next 2 fixtures including TV channel.
+FC Porto fixture scraper — ESPN
+Server-side rendered, no JS needed, no API key required.
 Writes /data/porto.json
 """

 import json
 import logging
+import os
 import re
 from datetime import datetime
 from pathlib import Path
@@ -21,194 +22,185 @@ log = logging.getLogger(__name__)
 DATA_FILE = Path("/data/porto.json")
 DATA_FILE.parent.mkdir(parents=True, exist_ok=True)

-PT_TZ = ZoneInfo("Europe/Lisbon")
+TEAM_ID   = os.environ.get("ESPN_TEAM_ID", "437")           # numeric id embedded in the ESPN page URLs
+TEAM_SLUG = os.environ.get("ESPN_TEAM_SLUG", "por.porto")   # URL segment that follows the id
+TEAM_NAME = os.environ.get("TEAM_NAME", "FC Porto")         # used in log lines and as "team" in the output
+PT_TZ     = ZoneInfo("Europe/Lisbon")                       # zone attached to parsed dates and "updated_at"
+
+COMP_MAP = {  # competition cell text -> short code stored as "abbr"
+    "Portuguese Primeira Liga": "LP",
+    "UEFA Europa League":       "UE",
+    "UEFA Champions League":    "CL",
+    "Taca de Portugal":         "TP",  # unaccented spelling of the entry below
+    "Taça de Portugal":         "TP",
+    "FIFA Club World Cup":      "CW",
+    "UEFA Super Cup":           "SC",
+}  # names missing here fall back to their first two letters, upper-cased

 HEADERS = {
-    "User-Agent": (
-        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
-        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
-    ),
-    "Accept-Language": "pt-PT,pt;q=0.9,en;q=0.8",
-}
-
-TV_CHANNELS = [
-    "Sport TV 1", "Sport TV 2", "Sport TV 3", "Sport TV 4", "Sport TV+",
-    "RTP 1", "RTP1", "SIC", "TVI", "Eleven Sports", "DAZN", "Canal+"
-]
-
-COMPETITIONS = {
-    "liga portugal": "Liga Portugal",
-    "primeira liga": "Liga Portugal",
-    "champions league": "Champions League",
-    "liga dos campeões": "Champions League",
-    "europa league": "Europa League",
-    "liga europa": "Europa League",
-    "taça de portugal": "Taça de Portugal",
-    "taca de portugal": "Taça de Portugal",
-    "supertaça": "Supertaça",
+    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+                  "Chrome/124.0 Safari/537.36",  # browser-like UA sent with every page request
+    "Accept-Language": "en-GB,en;q=0.9",  # NOTE(review): presumably keeps date cells in English for strptime — confirm
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
 }


-def fetch(url: str) -> BeautifulSoup | None:
-    try:
-        r = httpx.get(url, headers=HEADERS, timeout=30, follow_redirects=True)
-        r.raise_for_status()
-        return BeautifulSoup(r.text, "html.parser")
-    except Exception as e:
-        log.error("Failed to fetch %s: %s", url, e)
-        return None
+def scrape() -> list[dict]:
+    """Scrape ESPN's fixtures and results pages for the configured team.
+
+    Returns one dict per parsed row, sorted by timestamp; a page or row that raises is logged and skipped.
+    """
+    log.info("Scraping ESPN for %s (id=%s)...", TEAM_NAME, TEAM_ID)

+    base      = "https://www.espn.com/soccer/team"
+    urls      = {
+        "future": f"{base}/fixtures/_/id/{TEAM_ID}/{TEAM_SLUG}",
+        "past":   f"{base}/results/_/id/{TEAM_ID}/{TEAM_SLUG}",
+    }

-def normalise_competition(raw: str) -> str:
-    for key, val in COMPETITIONS.items():
-        if key in raw.lower():
-            return val
-    return raw.strip().title() if raw else "Liga Portugal"
+    porto_names = {"fc porto", "porto", TEAM_NAME.lower()}  # also match an overridden TEAM_NAME
+    logo_url    = "https://a.espncdn.com/i/teamlogos/soccer/500/{}.png"
+    now         = datetime.now(PT_TZ)  # one reference "today" for every row
+    matches     = []

+    for mode, url in urls.items():
+        is_past = mode == "past"

-def find_channel(text: str) -> str:
-    text_upper = text.upper()
-    for ch in TV_CHANNELS:
-        if ch.upper() in text_upper:
-            return ch
-    return "TBD"
-
-
-def parse_score(text: str) -> str | None:
-    m = re.search(r"(\d+)\s*[-–]\s*(\d+)", text)
-    return f"{m.group(1)} - {m.group(2)}" if m else None
-
-
-def parse_date(text: str) -> datetime | None:
-    m = re.search(r"(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})", text)
-    if not m:
-        return None
-    tm = re.search(r"(\d{2}):(\d{2})", text)
-    hour, minute = (int(tm.group(1)), int(tm.group(2))) if tm else (0, 0)
-    try:
-        return datetime(
-            int(m.group(3)), int(m.group(2)), int(m.group(1)),
-            hour, minute, tzinfo=PT_TZ
-        )
-    except ValueError:
-        return None
-
-
-def scrape_zerozero() -> list[dict]:
-    log.info("Scraping zerozero.pt...")
-    # Porto team page — jogos tab
-    soup = fetch("https://www.zerozero.pt/equipa.php?id=8&epoca_id=165")
-    if not soup:
-        return []
-
-    matches = []
-    now = datetime.now(tz=PT_TZ)
-
-    rows = soup.select("table tr.odd, table tr.even")
-    log.info("Found %d rows", len(rows))
-
-    for row in rows:
-        cells = row.find_all("td")
-        if len(cells) < 3:
+        try:
+            r = httpx.get(url, headers=HEADERS, timeout=30, follow_redirects=True)
+            r.raise_for_status()
+            soup = BeautifulSoup(r.text, "lxml")
+        except Exception as e:
+            log.error("Failed to fetch %s: %s", url, e)
            continue

-        full_text = row.get_text(" ", strip=True)
+        for table in soup.find_all("table"):
+            for row in table.find_all("tr"):
+                cells = row.find_all("td")
+                if len(cells) < 4:
+                    continue
+                try:
+                    # ── Date ──────────────────────────────────────────
+                    date_text = cells[0].get_text(strip=True)

-        dt = parse_date(full_text)
-        if not dt:
-            continue
+                    # ── Teams + logos ──────────────────────────────────
+                    match_cell = cells[1]
+                    team_links = match_cell.find_all(
+                        "a", href=re.compile(r"/soccer/team/_/id/\d+")
+                    )
+                    if len(team_links) < 2:
+                        continue

-        # Teams — look for equipa links
-        teams = []
-        for a in row.find_all("a", href=True):
-            if "equipa.php" in a["href"]:
-                name = a.get_text(strip=True)
-                if name and len(name) > 1:
-                    teams.append(name)
+                    home      = team_links[0].get_text(strip=True)
+                    away      = team_links[-1].get_text(strip=True)
+                    home_id   = re.search(r"/id/(\d+)/", team_links[0]["href"])
+                    away_id   = re.search(r"/id/(\d+)/", team_links[-1]["href"])
+                    home_logo = logo_url.format(home_id.group(1)) if home_id else None
+                    away_logo = logo_url.format(away_id.group(1)) if away_id else None

-        if len(teams) < 2:
-            continue
+                    is_home      = home.lower() in porto_names
+                    opponent     = away if is_home else home
+                    opponent_logo = away_logo if is_home else home_logo

-        home, away = teams[0], teams[1]
-        porto_names = {"fc porto", "porto", "f.c. porto"}
-        is_home = home.lower() in porto_names
-        opponent = away if is_home else home
+                    # ── Score / time ───────────────────────────────────
+                    score_text = cells[3].get_text(strip=True)  # index 3 exists: shorter rows were skipped
+                    score_str  = None
+                    time_str   = "TBD"

-        score = parse_score(full_text)
-        is_past = dt < now
+                    if is_past and re.match(r"\d+\s*-\s*\d+", score_text):
+                        score_str = score_text.replace(" ", "")
+                    elif not is_past:
+                        time_str = score_text or "TBD"

-        # Competition
-        comp_raw = ""
-        for a in row.find_all("a", href=True):
-            if "competicao" in a["href"] or "taça" in a.get_text().lower():
-                comp_raw = a.get_text(strip=True)
-                break
-        competition = normalise_competition(comp_raw)
+                    # ── Competition ────────────────────────────────────
+                    comp_raw = cells[5].get_text(strip=True) if len(cells) > 5 else "—"
+                    abbr     = COMP_MAP.get(comp_raw, comp_raw[:2].upper() if comp_raw != "—" else "—")

-        channel = find_channel(full_text)
+                    # ── Date parse ─────────────────────────────────────
+                    dt    = None
+                    stamp = f"{date_text} {now.year}"
+                    for fmt in ("%a, %b %d %Y", "%A, %B %d %Y"):
+                        try:
+                            dt = datetime.strptime(stamp, fmt).replace(tzinfo=PT_TZ)
+                            # date_text carries no year: a result cannot be dated after today
+                            # and a fixture cannot be dated before it, so shift by one year.
+                            if is_past and dt.date() > now.date():
+                                dt = dt.replace(year=now.year - 1)
+                            elif not is_past and dt.date() < now.date():
+                                dt = dt.replace(year=now.year + 1)
+                            break
+                        except ValueError:
+                            continue

-        matches.append({
-            "home": home,
-            "away": away,
-            "opponent": opponent,
-            "is_home": is_home,
-            "competition": competition,
-            "channel": channel,
-            "date": dt.strftime("%d/%m/%y"),
-            "time": dt.strftime("%H:%M") if dt.hour or dt.minute else "TBD",
-            "timestamp": dt.timestamp(),
-            "score": score,
-            "is_past": is_past,
-        })
+                    matches.append({
+                        "home":          home,
+                        "away":          away,
+                        "home_logo":     home_logo,
+                        "away_logo":     away_logo,
+                        "opponent":      opponent,
+                        "opponent_logo": opponent_logo,
+                        "is_home":       is_home,
+                        "competition":   comp_raw,
+                        "abbr":          abbr,
+                        "date":          dt.strftime("%d/%m/%y") if dt else date_text,
+                        "time":          time_str,
+                        "timestamp":     dt.timestamp() if dt else 0,
+                        "score":         score_str,
+                        "is_past":       is_past,
+                    })
+
+                except Exception as e:
+                    log.debug("Skipping row: %s", e)
+                    continue
+
+        log.info("  %s: parsed %d matches", mode, sum(1 for m in matches if m["is_past"] == is_past))

    matches.sort(key=lambda x: x["timestamp"])
+    n_past = sum(1 for m in matches if m["is_past"])
+    log.info("Total: %d past + %d future", n_past, len(matches) - n_past)
    return matches


 def build_output(matches: list[dict]) -> dict:
+    """Assemble the JSON payload: last 2 results, next 2 fixtures (placeholder-padded), full lists."""
-    now = datetime.now(tz=PT_TZ)
-    past = [m for m in matches if m["is_past"] and m.get("score")]
+    past   = [m for m in matches if m["is_past"] and m.get("score")]
    future = [m for m in matches if not m["is_past"]]

-    # last 2 results + next 2 fixtures
-    last2 = past[-2:] if len(past) >= 2 else past
+    last2 = past[-2:]  # a slice is always a copy, so padding below cannot leak into all_past
    next2 = future[:2] if len(future) >= 2 else future

-    # pad if needed
    empty = {
-        "home": "—", "away": "—", "opponent": "—", "is_home": True,
-        "competition": "—", "channel": "—", "date": "—", "time": "—",
-        "timestamp": 0, "score": None, "is_past": False,
+        "home": "—", "away": "—", "home_logo": None, "away_logo": None,
+        "opponent": "—", "opponent_logo": None, "is_home": True,
+        "competition": "—", "abbr": "—", "date": "—", "time": "—",
+        "timestamp": 0, "score": None, "is_past": False,
    }
    while len(last2) < 2:
-        last2.insert(0, empty)
+        last2.insert(0, dict(empty))
    while len(next2) < 2:
-        next2.append(empty)
+        next2 = [*next2, dict(empty)]  # rebind, don't mutate: next2 may be the list stored as all_future

    return {
-        "updated_at": now.isoformat(),
-        "team": "FC Porto",
-        "display": last2 + next2,   # always 4 items: [past, past, future, future]
-        "all_past": past,
+        "updated_at": datetime.now(tz=PT_TZ).isoformat(),
+        "team":       TEAM_NAME,
+        "display":    last2 + next2,
+        "all_past":   past,
        "all_future": future,
    }


 def main():
-    matches = scrape_zerozero()
+    matches = scrape()  # empty list when no row could be parsed from either page

    if not matches:
-        log.warning("No matches found — keeping existing data")
+        log.warning("No matches found — keeping existing data if present")  # with no file yet, placeholders are written below
        if DATA_FILE.exists():
            return
        matches = []

    output = build_output(matches)
    DATA_FILE.write_text(json.dumps(output, indent=2, ensure_ascii=False))
-    log.info(
-        "Wrote %d past + %d future matches",
-        len(output["all_past"]), len(output["all_future"])
-    )
+    log.info("Wrote %d total matches to %s", len(matches), DATA_FILE)  # counts every scraped row


 if __name__ == "__main__":