From 2f8e0a517a85e0aa0e4cc7badad4deb0d9971684 Mon Sep 17 00:00:00 2001
From: rgcosta <raul@rcosta.uk>
Date: Sun, 12 Apr 2026 13:27:16 +0000
Subject: [PATCH] Update scraper/scraper.py

---
 scraper/scraper.py | 106 ++++++++++++++++++++++++++-------------------
 1 file changed, 62 insertions(+), 44 deletions(-)

diff --git a/scraper/scraper.py b/scraper/scraper.py
index 105bdad..8ef07cd 100644
--- a/scraper/scraper.py
+++ b/scraper/scraper.py
@@ -48,8 +48,16 @@ HEADERS = {
 def scrape() -> list[dict]:
     log.info("Scraping ESPN for %s (id=%s)...", TEAM_NAME, TEAM_ID)
 
-    base      = "https://www.espn.com/soccer/team"
-    urls      = {
+    # ESPN table structure (indices into `cells`, the row's <td> elements):
+    # cells[0] = DATE
+    # cells[1] = HOME TEAM (link)
+    # cells[2] = SCORE or TIME (link for past, text for future)
+    # cells[3] = AWAY TEAM (link)
+    # cells[4] = FT / result status (past) | competition (some future rows)
+    # cells[5] = COMPETITION
+
+    base  = "https://www.espn.com/soccer/team"
+    pages = {
         "future": f"{base}/fixtures/_/id/{TEAM_ID}/{TEAM_SLUG}",
         "past":   f"{base}/results/_/id/{TEAM_ID}/{TEAM_SLUG}",
     }
@@ -57,7 +65,7 @@ def scrape() -> list[dict]:
     porto_names = {"fc porto", "porto"}
     matches     = []
 
-    for mode, url in urls.items():
+    for mode, url in pages.items():
         is_past = mode == "past"
 
         try:
@@ -68,64 +76,73 @@ def scrape() -> list[dict]:
             log.error("Failed to fetch %s: %s", url, e)
             continue
 
+        count = 0
         for table in soup.find_all("table"):
             for row in table.find_all("tr"):
                 cells = row.find_all("td")
-                if len(cells) < 4:
+                if len(cells) < 5:
                     continue
                 try:
-                    # ── Date ──────────────────────────────────────────
+                    # ── Date ──────────────────────────────────────
                     date_text = cells[0].get_text(strip=True)
-
-                    # ── Teams + logos ──────────────────────────────────
-                    match_cell = cells[1]
-                    team_links = match_cell.find_all(
-                        "a", href=re.compile(r"/soccer/team/_/id/\d+")
-                    )
-                    if len(team_links) < 2:
+                    if not date_text or date_text == "DATE":
                         continue
 
-                    home      = team_links[0].get_text(strip=True)
-                    away      = team_links[-1].get_text(strip=True)
-                    home_id   = re.search(r"/id/(\d+)/", team_links[0]["href"])
-                    away_id   = re.search(r"/id/(\d+)/", team_links[-1]["href"])
-                    home_logo = (
-                        f"https://a.espncdn.com/i/teamlogos/soccer/500/{home_id.group(1)}.png"
-                        if home_id else None
-                    )
-                    away_logo = (
-                        f"https://a.espncdn.com/i/teamlogos/soccer/500/{away_id.group(1)}.png"
-                        if away_id else None
-                    )
+                    # ── Home team ──────────────────────────────────
+                    home_link = cells[1].find("a", href=re.compile(r"/soccer/team/_/id/\d+"))
+                    if not home_link:
+                        continue
+                    home     = home_link.get_text(strip=True)
+                    home_m   = re.search(r"/id/(\d+)/", home_link["href"])
+                    home_id  = home_m.group(1) if home_m else None
+                    home_logo = f"https://a.espncdn.com/i/teamlogos/soccer/500/{home_id}.png" if home_id else None
 
+                    # ── Score / time (cell[2]) ─────────────────────
+                    mid_text = cells[2].get_text(strip=True)
+
+                    # ── Away team ──────────────────────────────────
+                    away_link = cells[3].find("a", href=re.compile(r"/soccer/team/_/id/\d+"))
+                    if not away_link:
+                        continue
+                    away     = away_link.get_text(strip=True)
+                    away_m   = re.search(r"/id/(\d+)/", away_link["href"])
+                    away_id  = away_m.group(1) if away_m else None
+                    away_logo = f"https://a.espncdn.com/i/teamlogos/soccer/500/{away_id}.png" if away_id else None
+
+                    # ── Competition ────────────────────────────────
+                    comp_raw = cells[5].get_text(strip=True) if len(cells) > 5 else \
+                               cells[4].get_text(strip=True)
+                    # Strip "FT" from competition if it crept in
+                    if comp_raw in ("FT", "AET", "Pen"):
+                        comp_raw = cells[5].get_text(strip=True) if len(cells) > 5 else "—"
+                    abbr = COMP_MAP.get(comp_raw, comp_raw[:2].upper() if comp_raw not in ("—", "") else "—")
+
+                    # ── Porto perspective ──────────────────────────
                     is_home      = home.lower() in porto_names
                     opponent     = away if is_home else home
                     opponent_logo = away_logo if is_home else home_logo
 
-                    # ── Score / time ───────────────────────────────────
-                    score_cell = cells[3] if len(cells) > 3 else cells[2]
-                    score_text = score_cell.get_text(strip=True)
-                    score_str  = None
-                    time_str   = "TBD"
+                    # ── Score vs time ──────────────────────────────
+                    score_str = None
+                    time_str  = "TBD"
+                    if is_past:
+                        if re.match(r"\d+\s*-\s*\d+", mid_text):
+                            score_str = mid_text.replace(" ", "")
+                        else:
+                            continue   # skip if no valid score
+                    else:
+                        time_str = mid_text if mid_text not in ("", "TBD") else "TBD"
 
-                    if is_past and re.match(r"\d+\s*-\s*\d+", score_text):
-                        score_str = score_text.replace(" ", "")
-                    elif not is_past:
-                        time_str = score_text if score_text not in ("", "TBD") else "TBD"
-
-                    # ── Competition ────────────────────────────────────
-                    comp_raw = cells[5].get_text(strip=True) if len(cells) > 5 else "—"
-                    abbr     = COMP_MAP.get(comp_raw, comp_raw[:2].upper() if comp_raw != "—" else "—")
-
-                    # ── Date parse ─────────────────────────────────────
+                    # ── Parse date ─────────────────────────────────
                     dt   = None
                     year = datetime.now(PT_TZ).year
                     for fmt in ("%a, %b %d %Y", "%A, %B %d %Y"):
                         try:
                             dt = datetime.strptime(f"{date_text} {year}", fmt)
                             dt = dt.replace(tzinfo=PT_TZ)
-                            # Roll over year if needed
-                            if dt < datetime.now(PT_TZ).replace(month=1, day=1):
+                            now = datetime.now(PT_TZ)
+                            # fixtures only: a month more than 2 behind the current one means next year
+                            if not is_past and dt.month < now.month - 2:
                                 dt = dt.replace(year=year + 1)
                             break
                         except ValueError:
@@ -147,22 +164,23 @@ def scrape() -> list[dict]:
                         "score":         score_str,
                         "is_past":       is_past,
                     })
+                    count += 1
 
                 except Exception as e:
                     log.debug("Skipping row: %s", e)
                     continue
 
-        log.info("  %s: parsed %d matches", mode, sum(1 for m in matches if m["is_past"] == is_past))
+        log.info("  %s: %d matches parsed", mode, count)
 
     matches.sort(key=lambda x: x["timestamp"])
-    log.info("Total: %d past + %d future", 
+    log.info("Total: %d past + %d future",
              sum(1 for m in matches if m["is_past"]),
              sum(1 for m in matches if not m["is_past"]))
     return matches
 
 
 def build_output(matches: list[dict]) -> dict:
-    past   = [m for m in matches if m["is_past"] and m.get("score")]
+    past   = [m for m in matches if m["is_past"]]
     future = [m for m in matches if not m["is_past"]]
 
     last2 = past[-2:]  if len(past)   >= 2 else past