Update scraper/scraper.py
All checks were successful
Build & Push Football Docker Images / build-push-update (push) Successful in 8s

This commit is contained in:
2026-04-12 13:17:15 +00:00
parent e4906c0e2a
commit 9428088c13

View File

@@ -1,12 +1,13 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
FC Porto scraper — zerozero.pt FC Porto fixture scraper — ESPN
Scrapes last 2 results and next 2 fixtures including TV channel. Server-side rendered, no JS needed, no API key required.
Writes /data/porto.json Writes /data/porto.json
""" """
import json import json
import logging import logging
import os
import re import re
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
@@ -21,194 +22,185 @@ log = logging.getLogger(__name__)
DATA_FILE = Path("/data/porto.json") DATA_FILE = Path("/data/porto.json")
DATA_FILE.parent.mkdir(parents=True, exist_ok=True) DATA_FILE.parent.mkdir(parents=True, exist_ok=True)
TEAM_ID = os.environ.get("ESPN_TEAM_ID", "437")
TEAM_SLUG = os.environ.get("ESPN_TEAM_SLUG", "por.porto")
TEAM_NAME = os.environ.get("TEAM_NAME", "FC Porto")
PT_TZ = ZoneInfo("Europe/Lisbon") PT_TZ = ZoneInfo("Europe/Lisbon")
COMP_MAP = {
"Portuguese Primeira Liga": "LP",
"UEFA Europa League": "UE",
"UEFA Champions League": "CL",
"Taca de Portugal": "TP",
"Taça de Portugal": "TP",
"FIFA Club World Cup": "CW",
"UEFA Super Cup": "SC",
}
HEADERS = { HEADERS = {
"User-Agent": ( "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " "Chrome/124.0 Safari/537.36",
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36" "Accept-Language": "en-GB,en;q=0.9",
), "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "pt-PT,pt;q=0.9,en;q=0.8",
}
TV_CHANNELS = [
"Sport TV 1", "Sport TV 2", "Sport TV 3", "Sport TV 4", "Sport TV+",
"RTP 1", "RTP1", "SIC", "TVI", "Eleven Sports", "DAZN", "Canal+"
]
COMPETITIONS = {
"liga portugal": "Liga Portugal",
"primeira liga": "Liga Portugal",
"champions league": "Champions League",
"liga dos campeões": "Champions League",
"europa league": "Europa League",
"liga europa": "Europa League",
"taça de portugal": "Taça de Portugal",
"taca de portugal": "Taça de Portugal",
"supertaça": "Supertaça",
} }
def _infer_match_date(date_text: str, is_past: bool, now: datetime) -> "datetime | None":
    """Turn a year-less date label (e.g. ``Sat, Apr 11``) into an aware datetime.

    The label carries no year, so the current year is tried first and then
    corrected using the page the row came from: a *result* cannot lie in the
    future and a *fixture* cannot lie in the past. A one-week slack keeps
    same-day and slightly stale rows in the current year.

    Args:
        date_text: Text of the row's date cell.
        is_past: True when the row came from the results page.
        now: Current aware datetime; its tzinfo is applied to the result.

    Returns:
        The parsed datetime (midnight of the match day), or None when the
        label matches neither supported format.
    """
    from datetime import timedelta  # local: the module only imports ``datetime``

    slack = timedelta(days=7)
    for fmt in ("%a, %b %d %Y", "%A, %B %d %Y"):
        try:
            dt = datetime.strptime(f"{date_text} {now.year}", fmt)
        except ValueError:
            continue
        dt = dt.replace(tzinfo=now.tzinfo)

        shift = 0
        if is_past and dt > now + slack:
            shift = -1  # e.g. a December result seen in April
        elif not is_past and dt < now - slack:
            shift = 1  # e.g. a January fixture seen in December
        if shift:
            try:
                dt = dt.replace(year=dt.year + shift)
            except ValueError:
                # 29 Feb has no counterpart in the neighbouring year; keep as is.
                pass
        return dt
    return None


def _team_logo(href: str) -> "str | None":
    """Build the ESPN CDN logo URL from a team link, or None if it has no id."""
    found = re.search(r"/id/(\d+)", href)
    if not found:
        return None
    return f"https://a.espncdn.com/i/teamlogos/soccer/500/{found.group(1)}.png"


def scrape() -> list[dict]:
    """Scrape the ESPN fixtures and results pages for the configured team.

    Both pages are fetched independently; a page that fails to download is
    logged and skipped so the other one can still contribute rows. Table rows
    that cannot be parsed are skipped (logged at debug level).

    Returns:
        Match dicts sorted by ascending ``timestamp``. Each dict has the keys
        ``home``, ``away``, ``home_logo``, ``away_logo``, ``opponent``,
        ``opponent_logo``, ``is_home``, ``competition``, ``abbr``, ``date``,
        ``time``, ``timestamp``, ``score`` and ``is_past``. Rows whose date
        could not be parsed keep the raw date text and a ``timestamp`` of 0.
    """
    log.info("Scraping ESPN for %s (id=%s)...", TEAM_NAME, TEAM_ID)

    base = "https://www.espn.com/soccer/team"
    urls = {
        "future": f"{base}/fixtures/_/id/{TEAM_ID}/{TEAM_SLUG}",
        "past": f"{base}/results/_/id/{TEAM_ID}/{TEAM_SLUG}",
    }

    # Honour the configurable team name as well as the historical aliases,
    # otherwise is_home is always False for any team other than Porto.
    porto_names = {"fc porto", "porto", TEAM_NAME.lower()}
    team_href = re.compile(r"/soccer/team/_/id/\d+")  # hoisted out of the row loop
    now = datetime.now(PT_TZ)

    matches = []
    for mode, url in urls.items():
        is_past = mode == "past"
        try:
            r = httpx.get(url, headers=HEADERS, timeout=30, follow_redirects=True)
            r.raise_for_status()
            soup = BeautifulSoup(r.text, "lxml")
        except Exception as e:
            # Deliberately broad: any fetch/parse failure only drops this page.
            log.error("Failed to fetch %s: %s", url, e)
            continue

        parsed = 0
        for table in soup.find_all("table"):
            for row in table.find_all("tr"):
                cells = row.find_all("td")
                if len(cells) < 4:
                    continue
                try:
                    # ── Date ──────────────────────────────────────────
                    date_text = cells[0].get_text(strip=True)

                    # ── Teams + logos ──────────────────────────────────
                    team_links = cells[1].find_all("a", href=team_href)
                    if len(team_links) < 2:
                        continue
                    home = team_links[0].get_text(strip=True)
                    away = team_links[-1].get_text(strip=True)
                    home_logo = _team_logo(team_links[0]["href"])
                    away_logo = _team_logo(team_links[-1]["href"])

                    is_home = home.lower() in porto_names
                    opponent = away if is_home else home
                    opponent_logo = away_logo if is_home else home_logo

                    # ── Score / time ───────────────────────────────────
                    # The fourth cell holds the score on the results page and
                    # the kick-off time on the fixtures page.
                    score_text = cells[3].get_text(strip=True)
                    score_str = None
                    time_str = "TBD"
                    if is_past and re.match(r"\d+\s*-\s*\d+", score_text):
                        score_str = score_text.replace(" ", "")
                    elif not is_past:
                        time_str = score_text or "TBD"

                    # ── Competition ────────────────────────────────────
                    comp_raw = cells[5].get_text(strip=True) if len(cells) > 5 else ""
                    abbr = COMP_MAP.get(comp_raw, comp_raw[:2].upper())

                    # ── Date parse ─────────────────────────────────────
                    dt = _infer_match_date(date_text, is_past, now)

                    matches.append({
                        "home": home,
                        "away": away,
                        "home_logo": home_logo,
                        "away_logo": away_logo,
                        "opponent": opponent,
                        "opponent_logo": opponent_logo,
                        "is_home": is_home,
                        "competition": comp_raw,
                        "abbr": abbr,
                        "date": dt.strftime("%d/%m/%y") if dt else date_text,
                        "time": time_str,
                        "timestamp": dt.timestamp() if dt else 0,
                        "score": score_str,
                        "is_past": is_past,
                    })
                    parsed += 1
                except Exception as e:
                    # Best effort: one malformed row must not abort the scrape.
                    log.debug("Skipping row: %s", e)
                    continue

        log.info("  %s: parsed %d matches", mode, parsed)

    matches.sort(key=lambda x: x["timestamp"])
    log.info("Total: %d past + %d future",
             sum(1 for m in matches if m["is_past"]),
             sum(1 for m in matches if not m["is_past"]))
    return matches
def build_output(matches: list[dict]) -> dict:
    """Assemble the JSON payload written to the data file.

    Args:
        matches: Match dicts as produced by ``scrape()``, sorted by ascending
            timestamp (the "last"/"next" selection relies on that order).

    Returns:
        A dict with:
          * ``updated_at`` – current time in the Lisbon zone, ISO 8601.
          * ``team`` – the configured team name.
          * ``display`` – always four items: the last two scored results
            followed by the next two fixtures, padded with blank placeholder
            entries when fewer are available.
          * ``all_past`` / ``all_future`` – every scored result / every
            fixture, without placeholders.
    """
    past = [m for m in matches if m["is_past"] and m.get("score")]
    future = [m for m in matches if not m["is_past"]]

    # Slices always produce new lists, so the padding below can never leak
    # placeholder rows into ``past`` / ``future`` (and thus into all_past /
    # all_future), which happened when the short lists were reused directly.
    last2 = past[-2:]
    next2 = future[:2]

    empty = {
        "home": "", "away": "", "home_logo": None, "away_logo": None,
        "opponent": "", "opponent_logo": None, "is_home": True,
        "competition": "", "abbr": "",
        "date": "", "time": "", "timestamp": 0,
        "score": None, "is_past": False,
    }
    # Each placeholder is its own copy so consumers may mutate entries safely.
    last2 = [dict(empty) for _ in range(2 - len(last2))] + last2
    next2 = next2 + [dict(empty) for _ in range(2 - len(next2))]

    return {
        "updated_at": datetime.now(tz=PT_TZ).isoformat(),
        "team": TEAM_NAME,
        "display": last2 + next2,
        "all_past": past,
        "all_future": future,
    }
def main() -> None:
    """Scrape the matches and write the JSON payload to ``DATA_FILE``.

    When nothing could be scraped and a data file already exists, the existing
    file is left untouched so consumers keep the last good data. If no file
    exists yet, a payload containing only placeholders is written.
    """
    matches = scrape()
    if not matches:
        log.warning("No matches found — keeping existing data if present")
        if DATA_FILE.exists():
            return
        matches = []

    output = build_output(matches)
    payload = json.dumps(output, indent=2, ensure_ascii=False)

    # ensure_ascii=False emits raw non-ASCII characters (e.g. "Taça"), so the
    # encoding is pinned instead of depending on the container's locale.
    # Writing to a sibling temp file and renaming keeps readers from ever
    # seeing a half-written file.
    tmp_file = DATA_FILE.with_name(DATA_FILE.name + ".tmp")
    tmp_file.write_text(payload, encoding="utf-8")
    tmp_file.replace(DATA_FILE)

    log.info("Wrote %d total matches to %s", len(matches), DATA_FILE)
if __name__ == "__main__": if __name__ == "__main__":