From 2f8e0a517a85e0aa0e4cc7badad4deb0d9971684 Mon Sep 17 00:00:00 2001 From: rgcosta Date: Sun, 12 Apr 2026 13:27:16 +0000 Subject: [PATCH] Update scraper/scraper.py --- scraper/scraper.py | 106 ++++++++++++++++++++++++++------------------- 1 file changed, 62 insertions(+), 44 deletions(-) diff --git a/scraper/scraper.py b/scraper/scraper.py index 105bdad..8ef07cd 100644 --- a/scraper/scraper.py +++ b/scraper/scraper.py @@ -48,8 +48,16 @@ HEADERS = { def scrape() -> list[dict]: log.info("Scraping ESPN for %s (id=%s)...", TEAM_NAME, TEAM_ID) - base = "https://www.espn.com/soccer/team" - urls = { + # ESPN table structure: + # cell[0] = DATE + # cell[1] = HOME TEAM (link) + # cell[2] = SCORE or TIME (link for past, text for future) + # cell[3] = AWAY TEAM (link) + # cell[4] = FT / result status (past) | competition (future some) + # cell[5] = COMPETITION + + base = "https://www.espn.com/soccer/team" + pages = { "future": f"{base}/fixtures/_/id/{TEAM_ID}/{TEAM_SLUG}", "past": f"{base}/results/_/id/{TEAM_ID}/{TEAM_SLUG}", } @@ -57,7 +65,7 @@ def scrape() -> list[dict]: porto_names = {"fc porto", "porto"} matches = [] - for mode, url in urls.items(): + for mode, url in pages.items(): is_past = mode == "past" try: @@ -68,64 +76,73 @@ def scrape() -> list[dict]: log.error("Failed to fetch %s: %s", url, e) continue + count = 0 for table in soup.find_all("table"): for row in table.find_all("tr"): cells = row.find_all("td") - if len(cells) < 4: + if len(cells) < 5: continue try: - # ── Date ────────────────────────────────────────── + # ── Date ────────────────────────────────────── date_text = cells[0].get_text(strip=True) - - # ── Teams + logos ────────────────────────────────── - match_cell = cells[1] - team_links = match_cell.find_all( - "a", href=re.compile(r"/soccer/team/_/id/\d+") - ) - if len(team_links) < 2: + if not date_text or date_text == "DATE": continue - home = team_links[0].get_text(strip=True) - away = team_links[-1].get_text(strip=True) - home_id = re.search(r"/id/(\d+)/", team_links[0]["href"]) - away_id = re.search(r"/id/(\d+)/", team_links[-1]["href"]) - home_logo = ( - f"https://a.espncdn.com/i/teamlogos/soccer/500/{home_id.group(1)}.png" - if home_id else None - ) - away_logo = ( - f"https://a.espncdn.com/i/teamlogos/soccer/500/{away_id.group(1)}.png" - if away_id else None - ) + # ── Home team ────────────────────────────────── + home_link = cells[1].find("a", href=re.compile(r"/soccer/team/_/id/\d+")) + if not home_link: + continue + home = home_link.get_text(strip=True) + home_m = re.search(r"/id/(\d+)/", home_link["href"]) + home_id = home_m.group(1) if home_m else None + home_logo = f"https://a.espncdn.com/i/teamlogos/soccer/500/{home_id}.png" if home_id else None + # ── Score / time (cell[2]) ───────────────────── + mid_text = cells[2].get_text(strip=True) + + # ── Away team ────────────────────────────────── + away_link = cells[3].find("a", href=re.compile(r"/soccer/team/_/id/\d+")) + if not away_link: + continue + away = away_link.get_text(strip=True) + away_m = re.search(r"/id/(\d+)/", away_link["href"]) + away_id = away_m.group(1) if away_m else None + away_logo = f"https://a.espncdn.com/i/teamlogos/soccer/500/{away_id}.png" if away_id else None + + # ── Competition ──────────────────────────────── + comp_raw = cells[5].get_text(strip=True) if len(cells) > 5 else \ + cells[4].get_text(strip=True) + # Strip "FT" from competition if it crept in + if comp_raw in ("FT", "AET", "Pen"): + comp_raw = cells[5].get_text(strip=True) if len(cells) > 5 else "—" + abbr = COMP_MAP.get(comp_raw, comp_raw[:2].upper() if comp_raw not in ("—", "") else "—") + + # ── Porto perspective ────────────────────────── is_home = home.lower() in porto_names opponent = away if is_home else home opponent_logo = away_logo if is_home else home_logo - # ── Score / time ─────────────────────────────────── - score_cell = cells[3] if len(cells) > 3 else cells[2] - score_text = score_cell.get_text(strip=True) - score_str = None - time_str = "TBD" + # ── Score vs time ────────────────────────────── + score_str = None + time_str = "TBD" + if is_past: + if re.match(r"\d+\s*-\s*\d+", mid_text): + score_str = mid_text.replace(" ", "") + else: + continue # skip if no valid score + else: + time_str = mid_text if mid_text not in ("", "TBD") else "TBD" - if is_past and re.match(r"\d+\s*-\s*\d+", score_text): - score_str = score_text.replace(" ", "") - elif not is_past: - time_str = score_text if score_text not in ("", "TBD") else "TBD" - - # ── Competition ──────────────────────────────────── - comp_raw = cells[5].get_text(strip=True) if len(cells) > 5 else "—" - abbr = COMP_MAP.get(comp_raw, comp_raw[:2].upper() if comp_raw != "—" else "—") - - # ── Date parse ───────────────────────────────────── + # ── Parse date ───────────────────────────────── dt = None year = datetime.now(PT_TZ).year for fmt in ("%a, %b %d %Y", "%A, %B %d %Y"): try: dt = datetime.strptime(f"{date_text} {year}", fmt) dt = dt.replace(tzinfo=PT_TZ) - # Roll over year if needed - if dt < datetime.now(PT_TZ).replace(month=1, day=1): + now = datetime.now(PT_TZ) + # roll year forward for future dates that wrap + if not is_past and dt.month < now.month - 2: dt = dt.replace(year=year + 1) break except ValueError: @@ -147,22 +164,23 @@ def scrape() -> list[dict]: "score": score_str, "is_past": is_past, }) + count += 1 except Exception as e: log.debug("Skipping row: %s", e) continue - log.info(" %s: parsed %d matches", mode, sum(1 for m in matches if m["is_past"] == is_past)) + log.info(" %s: %d matches parsed", mode, count) matches.sort(key=lambda x: x["timestamp"]) - log.info("Total: %d past + %d future", + log.info("Total: %d past + %d future", sum(1 for m in matches if m["is_past"]), sum(1 for m in matches if not m["is_past"])) return matches def build_output(matches: list[dict]) -> dict: - past = [m for m in matches if m["is_past"] and m.get("score")] + past = [m for m in matches if m["is_past"]] future = [m for m in matches if not m["is_past"]] last2 = past[-2:] if len(past) >= 2 else past