Files
football-api/scraper/scraper.py
rgcosta aabcd8f6a8
All checks were successful
Build & Push Football Docker Images / build-push-update (push) Successful in 7s
Update scraper/scraper.py
2026-04-12 15:39:49 +00:00

236 lines
9.7 KiB
Python

#!/usr/bin/env python3
"""
FC Porto fixture scraper — ESPN
Server-side rendered, no JS needed, no API key required.
Writes /data/porto.json
"""
import json
import logging
import os
import re
from datetime import datetime
from pathlib import Path
from zoneinfo import ZoneInfo
import httpx
from bs4 import BeautifulSoup
# ── Logging ────────────────────────────────────────────────────────────────
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)
log = logging.getLogger(__name__)

# ── Output location ────────────────────────────────────────────────────────
# The JSON document is written here; the parent directory is created at
# import time so a later write cannot fail on a missing directory.
DATA_FILE = Path("/data/porto.json")
DATA_FILE.parent.mkdir(parents=True, exist_ok=True)

# ── Team selection (each value can be overridden through the environment) ──
TEAM_ID = os.getenv("ESPN_TEAM_ID", "437")
TEAM_SLUG = os.getenv("ESPN_TEAM_SLUG", "por.porto")
TEAM_NAME = os.getenv("TEAM_NAME", "FC Porto")

# Timezone attached to every parsed match date and to the "updated_at" stamp.
PT_TZ = ZoneInfo("Europe/Lisbon")

# Competition name as shown by ESPN -> two-letter abbreviation.
# Both the plain and the accented spelling of the Portuguese cup are listed.
COMP_MAP = {
    "Portuguese Primeira Liga": "LP",
    "UEFA Europa League": "UE",
    "UEFA Champions League": "CL",
    "Taca de Portugal": "TP",
    "Taça de Portugal": "TP",
    "FIFA Club World Cup": "CW",
    "UEFA Super Cup": "SC",
}

# Browser-like request headers sent with every page fetch.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "Chrome/124.0 Safari/537.36"
    ),
    "Accept-Language": "en-GB,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
# Patterns and literals shared by the row-parsing helpers below; the regexes
# are compiled once at import time instead of once per table row.
_TEAM_HREF_RE = re.compile(r"/soccer/team/_/id/\d+")
_TEAM_ID_RE = re.compile(r"/id/(\d+)/")
_SCORE_RE = re.compile(r"^(\d{1,2})\s*-\s*(\d{1,2})")
_STATUS_MARKERS = ("FT", "AET", "Pen")
_DATE_FORMATS = ("%a, %b %d %Y", "%A, %B %d %Y")
_LOGO_URL = "https://a.espncdn.com/i/teamlogos/soccer/500/{}.png"


def _team_from_cell(cell):
    """Return ``(name, espn_id, logo_url)`` for a team cell, or ``None``.

    ``None`` means the cell holds no link whose href matches the ESPN team
    URL pattern. ``espn_id`` and ``logo_url`` are ``None`` when the numeric
    id cannot be extracted from the href.
    """
    link = cell.find("a", href=_TEAM_HREF_RE)
    if not link:
        return None
    id_match = _TEAM_ID_RE.search(link["href"])
    team_id = id_match.group(1) if id_match else None
    logo = _LOGO_URL.format(team_id) if team_id else None
    return link.get_text(strip=True), team_id, logo


def _resolve_match_date(date_text, is_past, now):
    """Turn a year-less date label into a timezone-aware ``datetime``.

    The label carries no year, so the current year is tried first. A result
    must not lie after today, and a fixture must not lie before today; when
    the current year violates that, the neighbouring year (previous for
    results, next for fixtures) is used instead. Trying both years also lets
    a "Feb 29" label resolve when only one of the two years is a leap year.

    Returns ``None`` when the label matches none of the known formats.
    """
    today = now.date()
    neighbour = now.year - 1 if is_past else now.year + 1
    fallback = None
    for fmt in _DATE_FORMATS:
        for year in (now.year, neighbour):
            try:
                candidate = datetime.strptime(f"{date_text} {year}", fmt)
            except ValueError:
                continue
            candidate = candidate.replace(tzinfo=PT_TZ)
            # date() comparison so today's unplayed games stay in the future.
            plausible = (
                candidate.date() <= today if is_past else candidate.date() >= today
            )
            if plausible:
                return candidate
            if fallback is None:
                fallback = candidate
        if fallback is not None:
            # The format matched but no plausible year exists (the neighbour
            # year could not represent the date): keep the current-year value.
            return fallback
    return None


def _parse_row(cells, is_past, now, tracked_names):
    """Build one match record from the ``<td>`` cells of a table row.

    Returns ``None`` for rows that are not matches: header rows, rows
    without two team links, and result rows without a readable score.
    """
    date_text = cells[0].get_text(strip=True)
    if not date_text or date_text == "DATE":
        return None

    home_team = _team_from_cell(cells[1])
    away_team = _team_from_cell(cells[3])
    if home_team is None or away_team is None:
        return None
    home, home_id, home_logo = home_team
    away, away_id, away_logo = away_team

    # Drop gameNote spans (e.g. "1st Leg", aggregate text) before reading the
    # score / kick-off text so they do not pollute it.
    for note in cells[2].find_all("span", class_="gameNote"):
        note.decompose()
    mid_text = cells[2].get_text(strip=True)

    score_str = None
    time_str = "TBD"
    if is_past:
        score_match = _SCORE_RE.match(mid_text)
        if not score_match:
            return None
        score_str = f"{score_match.group(1)}-{score_match.group(2)}"
    else:
        time_str = mid_text or "TBD"

    # Competition is read from cell 5 when present, else from cell 4. A bare
    # result-status marker in that position is not a competition name.
    comp_cell = cells[5] if len(cells) > 5 else cells[4]
    comp_raw = comp_cell.get_text(strip=True)
    if comp_raw in _STATUS_MARKERS:
        comp_raw = ""
    abbr = COMP_MAP.get(comp_raw, comp_raw[:2].upper())

    # Identify the tracked team by its ESPN id so the scraper honours
    # ESPN_TEAM_ID; fall back to the display name only when neither link
    # carries that id.
    if TEAM_ID in (home_id, away_id):
        is_home = home_id == TEAM_ID
    else:
        is_home = home.lower() in tracked_names

    dt = _resolve_match_date(date_text, is_past, now)
    return {
        "home": home,
        "away": away,
        "home_logo": home_logo,
        "away_logo": away_logo,
        "opponent": away if is_home else home,
        "opponent_logo": away_logo if is_home else home_logo,
        "is_home": is_home,
        "competition": comp_raw,
        "abbr": abbr,
        "date": dt.strftime("%d/%m/%y") if dt else date_text,
        "time": time_str,
        "timestamp": dt.timestamp() if dt else 0,
        "score": score_str,
        "is_past": is_past,
    }


def scrape() -> list[dict]:
    """Fetch the team's ESPN fixtures and results pages and parse them.

    Both pages are downloaded with ``httpx`` and every ``<tr>`` of every
    ``<table>`` is inspected. A page that cannot be fetched is logged and
    skipped, so the result may contain only fixtures or only results.

    Returns:
        A list of match dicts sorted by ``timestamp`` (ascending). Each dict
        has the keys ``home``, ``away``, ``home_logo``, ``away_logo``,
        ``opponent``, ``opponent_logo``, ``is_home``, ``competition``,
        ``abbr``, ``date``, ``time``, ``timestamp``, ``score`` and
        ``is_past``. The list is empty when nothing could be parsed.
    """
    log.info("Scraping ESPN for %s (id=%s)...", TEAM_NAME, TEAM_ID)
    # Table layout this parser was written against (author's notes):
    #   cell[0] = DATE
    #   cell[1] = HOME TEAM (link)
    #   cell[2] = SCORE or TIME (link for past, text for future)
    #   cell[3] = AWAY TEAM (link)
    #   cell[4] = FT / result status (past) | competition (future some)
    #   cell[5] = COMPETITION
    base = "https://www.espn.com/soccer/team"
    pages = {
        "future": f"{base}/fixtures/_/id/{TEAM_ID}/{TEAM_SLUG}",
        "past": f"{base}/results/_/id/{TEAM_ID}/{TEAM_SLUG}",
    }
    # Name fallback for the home/away decision: the configured name with and
    # without a leading "FC " ({"fc porto", "porto"} for the default team).
    configured = TEAM_NAME.lower()
    tracked_names = {configured, configured.removeprefix("fc ")}
    # One reference instant for the whole run keeps year roll-over decisions
    # consistent across rows.
    now = datetime.now(PT_TZ)

    matches = []
    for mode, url in pages.items():
        is_past = mode == "past"
        try:
            r = httpx.get(url, headers=HEADERS, timeout=30, follow_redirects=True)
            r.raise_for_status()
            soup = BeautifulSoup(r.text, "lxml")
        except Exception as e:
            log.error("Failed to fetch %s: %s", url, e)
            continue

        count = 0
        for table in soup.find_all("table"):
            for row in table.find_all("tr"):
                cells = row.find_all("td")
                if len(cells) < 5:
                    continue
                try:
                    match = _parse_row(cells, is_past, now, tracked_names)
                except Exception as e:
                    # Best effort: one malformed row must not abort the page.
                    log.debug("Skipping row: %s", e)
                    continue
                if match is None:
                    continue
                matches.append(match)
                count += 1
        log.info(" %s: %d matches parsed", mode, count)

    matches.sort(key=lambda x: x["timestamp"])
    log.info("Total: %d past + %d future",
             sum(1 for m in matches if m["is_past"]),
             sum(1 for m in matches if not m["is_past"]))
    return matches
def build_output(matches: list[dict]) -> dict:
    """Assemble the JSON document written to disk from parsed matches.

    Args:
        matches: Match dicts, each carrying at least an ``is_past`` key.
            The caller passes them already sorted by timestamp, so the last
            two past entries are the most recent results and the first two
            future entries are the next fixtures.

    Returns:
        A dict with ``updated_at`` (ISO timestamp), ``team``, ``display``
        (exactly four entries: two results followed by two fixtures, padded
        with blank placeholder records when fewer are available),
        ``all_past`` and ``all_future``.
    """
    past = [m for m in matches if m["is_past"]]
    future = [m for m in matches if not m["is_past"]]

    # Slicing always yields a new list, even when fewer than two items
    # exist. Padding therefore never touches `past` / `future`, which are
    # published unmodified as "all_past" / "all_future".
    last2 = past[-2:]
    next2 = future[:2]

    empty = {
        "home": "", "away": "", "home_logo": None, "away_logo": None,
        "opponent": "", "opponent_logo": None, "is_home": True,
        "competition": "", "abbr": "",
        "date": "", "time": "", "timestamp": 0,
        "score": None, "is_past": False,
    }
    # Placeholders go before the results and after the fixtures so that real
    # entries stay adjacent to the results/fixtures boundary.
    last2 = [dict(empty) for _ in range(2 - len(last2))] + last2
    next2 = next2 + [dict(empty) for _ in range(2 - len(next2))]

    return {
        "updated_at": datetime.now(tz=PT_TZ).isoformat(),
        "team": TEAM_NAME,
        "display": last2 + next2,
        "all_past": past,
        "all_future": future,
    }
def main():
    """Scrape the matches and write the JSON document to ``DATA_FILE``.

    When the scrape yields nothing and a data file already exists, the file
    is left untouched so consumers keep the last good data. When no file
    exists yet, a document containing only placeholder entries is written.
    """
    matches = scrape()
    if not matches:
        log.warning("No matches found — keeping existing data if present")
        if DATA_FILE.exists():
            return
    output = build_output(matches)
    # The document is serialised with ensure_ascii=False and can contain
    # non-ASCII text (e.g. "Taça de Portugal"), so pin the encoding instead
    # of relying on the process locale.
    DATA_FILE.write_text(
        json.dumps(output, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    log.info("Wrote %d total matches to %s", len(matches), DATA_FILE)


if __name__ == "__main__":
    main()