Update scraper/scraper.py
All checks were successful
Build & Push Football Docker Images / build-push-update (push) Successful in 8s

This commit is contained in:
2026-04-12 13:17:15 +00:00
parent e4906c0e2a
commit 9428088c13

View File

@@ -1,12 +1,13 @@
#!/usr/bin/env python3
"""
FC Porto fixture scraper — ESPN
Server-side rendered, no JS needed, no API key required.
Writes /data/porto.json
"""
import json import json
import logging import logging
import os
import re import re
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
@@ -21,194 +22,185 @@ log = logging.getLogger(__name__)
# Output file; its parent directory is created at import time so that the
# final write cannot fail on a missing directory.
DATA_FILE = Path("/data/porto.json")
DATA_FILE.parent.mkdir(parents=True, exist_ok=True)
# Team selection. Each value can be overridden through the environment; the
# defaults select FC Porto.
TEAM_ID = os.environ.get("ESPN_TEAM_ID", "437")
TEAM_SLUG = os.environ.get("ESPN_TEAM_SLUG", "por.porto")
TEAM_NAME = os.environ.get("TEAM_NAME", "FC Porto")

# Timezone attached to every datetime the scraper produces.
PT_TZ = ZoneInfo("Europe/Lisbon")

# Competition name as scraped -> two-letter abbreviation written to the output.
# Both the accented and unaccented spellings of "Taça de Portugal" are listed.
COMP_MAP = {
    "Portuguese Primeira Liga": "LP",
    "UEFA Europa League": "UE",
    "UEFA Champions League": "CL",
    "Taca de Portugal": "TP",
    "Taça de Portugal": "TP",
    "FIFA Club World Cup": "CW",
    "UEFA Super Cup": "SC",
}

# Request headers sent with every page fetch.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                  "Chrome/124.0 Safari/537.36",
    "Accept-Language": "en-GB,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
# Formats tried against the date column. The column text carries no year, so a
# candidate year is appended before parsing.
_DATE_FORMATS = ("%a, %b %d %Y", "%A, %B %d %Y")

# Matches a team link and captures the numeric team id. No trailing slash is
# required, so the id is found for every link the row filter accepts.
_TEAM_HREF_RE = re.compile(r"/soccer/team/_/id/(\d+)")

_SCORE_RE = re.compile(r"\d+\s*-\s*\d+")


def _infer_match_date(date_text: str, is_past: bool, now: datetime) -> datetime | None:
    """Parse a year-less date string and pick the year that fits the page.

    A result cannot lie in the future and a fixture cannot lie in the past, so
    the current year is tried first and, if that contradicts ``is_past``, the
    adjacent year (previous for results, next for fixtures). One day of slack
    is allowed either way. Returns ``None`` when no format matches.
    """
    today = now.date().toordinal()
    step = -1 if is_past else 1
    fallback = None
    for year in (now.year, now.year + step):
        for fmt in _DATE_FORMATS:
            try:
                parsed = datetime.strptime(f"{date_text} {year}", fmt)
            except ValueError:
                # Wrong format, or 29 February in a non-leap candidate year.
                continue
            parsed = parsed.replace(tzinfo=PT_TZ)
            offset = parsed.date().toordinal() - today
            if (offset <= 1) if is_past else (offset >= -1):
                return parsed
            if fallback is None:
                fallback = parsed
            break  # format matched but the year is implausible: try the next year
    return fallback


def _team_id(href: str) -> str | None:
    """Return the numeric team id embedded in a team link, if any."""
    found = _TEAM_HREF_RE.search(href)
    return found.group(1) if found else None


def _logo_url(team_id: str | None) -> str | None:
    """Build the logo URL for a team id; ``None`` when the id is unknown."""
    if team_id is None:
        return None
    return f"https://a.espncdn.com/i/teamlogos/soccer/500/{team_id}.png"


def _parse_row(cells: list, is_past: bool, now: datetime) -> dict | None:
    """Turn the ``<td>`` cells of one table row into a match dict.

    The caller guarantees at least four cells. Returns ``None`` when the row
    does not contain two team links (header or separator rows).
    """
    date_text = cells[0].get_text(strip=True)

    # Teams + logos: first and last team link in the match cell.
    team_links = cells[1].find_all("a", href=_TEAM_HREF_RE)
    if len(team_links) < 2:
        return None
    home_link, away_link = team_links[0], team_links[-1]
    home = home_link.get_text(strip=True)
    away = away_link.get_text(strip=True)
    home_id = _team_id(home_link["href"])
    away_id = _team_id(away_link["href"])
    home_logo = _logo_url(home_id)
    away_logo = _logo_url(away_id)

    # Prefer the configured team id, so home/away detection follows
    # ESPN_TEAM_ID; fall back to name matching when the id is on neither side.
    if TEAM_ID in (home_id, away_id):
        is_home = home_id == TEAM_ID
    else:
        is_home = home.lower() in {"fc porto", "porto", TEAM_NAME.lower()}
    opponent = away if is_home else home
    opponent_logo = away_logo if is_home else home_logo

    # Score (results page) or kick-off time (fixtures page) share one column.
    score_text = cells[3].get_text(strip=True)
    score_str = None
    time_str = "TBD"
    if is_past and _SCORE_RE.match(score_text):
        score_str = score_text.replace(" ", "")
    elif not is_past:
        time_str = score_text if score_text not in ("", "TBD") else "TBD"

    # Competition: unknown names fall back to their first two letters.
    comp_raw = cells[5].get_text(strip=True) if len(cells) > 5 else ""
    abbr = COMP_MAP.get(comp_raw, comp_raw[:2].upper() if comp_raw else "")

    dt = _infer_match_date(date_text, is_past, now)

    return {
        "home": home,
        "away": away,
        "home_logo": home_logo,
        "away_logo": away_logo,
        "opponent": opponent,
        "opponent_logo": opponent_logo,
        "is_home": is_home,
        "competition": comp_raw,
        "abbr": abbr,
        "date": dt.strftime("%d/%m/%y") if dt else date_text,
        "time": time_str,
        "timestamp": dt.timestamp() if dt else 0,
        "score": score_str,
        "is_past": is_past,
    }


def scrape() -> list[dict]:
    """Fetch the team's fixtures and results pages and return parsed matches.

    Both pages are fetched with ``httpx`` and parsed with BeautifulSoup. A page
    that fails to download is logged and skipped; a row that fails to parse is
    logged at debug level and skipped. The returned list is sorted by
    ``timestamp`` (matches whose date could not be parsed carry timestamp 0 and
    therefore sort first).
    """
    log.info("Scraping ESPN for %s (id=%s)...", TEAM_NAME, TEAM_ID)

    base = "https://www.espn.com/soccer/team"
    urls = {
        "future": f"{base}/fixtures/_/id/{TEAM_ID}/{TEAM_SLUG}",
        "past": f"{base}/results/_/id/{TEAM_ID}/{TEAM_SLUG}",
    }

    now = datetime.now(PT_TZ)
    matches = []

    for mode, url in urls.items():
        is_past = mode == "past"

        try:
            r = httpx.get(url, headers=HEADERS, timeout=30, follow_redirects=True)
            r.raise_for_status()
            soup = BeautifulSoup(r.text, "lxml")
        except Exception as e:
            # Best effort: one failing page must not discard the other.
            log.error("Failed to fetch %s: %s", url, e)
            continue

        parsed = 0
        for table in soup.find_all("table"):
            for row in table.find_all("tr"):
                cells = row.find_all("td")
                if len(cells) < 4:
                    continue
                try:
                    match = _parse_row(cells, is_past, now)
                except Exception as e:
                    log.debug("Skipping row: %s", e)
                    continue
                if match is not None:
                    matches.append(match)
                    parsed += 1

        log.info(" %s: parsed %d matches", mode, parsed)

    matches.sort(key=lambda x: x["timestamp"])
    log.info("Total: %d past + %d future",
             sum(1 for m in matches if m["is_past"]),
             sum(1 for m in matches if not m["is_past"]))
    return matches
def build_output(matches: list[dict]) -> dict:
    """Assemble the JSON payload from the scraped matches.

    Args:
        matches: Match dicts, expected in chronological order; each needs an
            ``is_past`` key, and past matches need a truthy ``score`` to count
            as a result.

    Returns:
        A dict with ``updated_at``, ``team``, ``display`` (always four items:
        the last two results followed by the next two fixtures, padded with
        empty placeholders when fewer exist), ``all_past`` and ``all_future``.
    """
    past = [m for m in matches if m["is_past"] and m.get("score")]
    future = [m for m in matches if not m["is_past"]]

    # Slices are always new lists, so the padding below can never leak
    # placeholder entries into ``all_past`` / ``all_future``.
    last2 = past[-2:]
    next2 = future[:2]

    empty = {
        "home": "", "away": "", "home_logo": None, "away_logo": None,
        "opponent": "", "opponent_logo": None, "is_home": True,
        "competition": "", "abbr": "",
        "date": "", "time": "", "timestamp": 0,
        "score": None, "is_past": False,
    }
    # Each placeholder is its own copy so consumers cannot mutate a shared dict.
    while len(last2) < 2:
        last2.insert(0, dict(empty))
    while len(next2) < 2:
        next2.append(dict(empty))

    return {
        "updated_at": datetime.now(tz=PT_TZ).isoformat(),
        "team": TEAM_NAME,
        "display": last2 + next2,
        "all_past": past,
        "all_future": future,
    }
def main() -> None:
    """Scrape the matches and write the JSON payload to ``DATA_FILE``.

    When nothing could be scraped and a data file already exists, the existing
    file is left untouched; otherwise an all-placeholder payload is written.
    """
    matches = scrape()
    if not matches:
        log.warning("No matches found — keeping existing data if present")
        if DATA_FILE.exists():
            return
        matches = []

    output = build_output(matches)
    # The payload is dumped with ensure_ascii=False, so the encoding is pinned
    # rather than left to the locale default.
    DATA_FILE.write_text(
        json.dumps(output, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    log.info("Wrote %d total matches to %s", len(matches), DATA_FILE)
if __name__ == "__main__": if __name__ == "__main__":