Files
football-api/scrapper.py
2026-03-13 18:00:42 +00:00

215 lines
5.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
FC Porto scraper — zerozero.pt
Scrapes last 2 results and next 2 fixtures including TV channel.
Writes /data/porto.json
"""
import json
import logging
import re
from datetime import datetime
from pathlib import Path
from zoneinfo import ZoneInfo
import httpx
from bs4 import BeautifulSoup
# Configure root logging once at import: INFO level, "<time> <level> <message>".
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)
# JSON file that main() writes.  Its parent directory is created here, at
# import time (no error if it already exists).
DATA_FILE = Path("/data/porto.json")
DATA_FILE.parent.mkdir(parents=True, exist_ok=True)
# Time zone attached to every parsed kick-off and used for "now" comparisons.
PT_TZ = ZoneInfo("Europe/Lisbon")
# HTTP headers sent by fetch() with every request: a desktop Chrome-on-Linux
# User-Agent string and a language preference of Portuguese (Portugal) first,
# then generic Portuguese, then English.
# NOTE(review): presumably the browser-like User-Agent is there so the site
# serves its regular page to the scraper — confirm.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "pt-PT,pt;q=0.9,en;q=0.8",
}
# Broadcaster names that find_channel() looks for (case-insensitively) in a
# match row's text.  The list is searched in order and the first entry found
# is returned exactly as spelled here; both "RTP 1" and "RTP1" are listed so
# either spelling on the page is recognised.
TV_CHANNELS = [
    "Sport TV 1", "Sport TV 2", "Sport TV 3", "Sport TV 4", "Sport TV+",
    "RTP 1", "RTP1", "SIC", "TVI", "Eleven Sports", "DAZN", "Canal+"
]
# Lower-case name fragments mapped to the canonical competition name used in
# the output.  normalise_competition() lower-cases the scraped label and
# returns the value of the first key (in insertion order) that it contains.
# Portuguese and English spellings, with and without the cedilla, map to the
# same canonical name.
COMPETITIONS = {
    "liga portugal": "Liga Portugal",
    "primeira liga": "Liga Portugal",
    "champions league": "Champions League",
    "liga dos campeões": "Champions League",
    "europa league": "Europa League",
    "liga europa": "Europa League",
    "taça de portugal": "Taça de Portugal",
    "taca de portugal": "Taça de Portugal",
    "supertaça": "Supertaça",
}
def fetch(url: str) -> BeautifulSoup | None:
    """Download *url* and return its HTML as a parsed BeautifulSoup tree.

    The GET request carries the module-level ``HEADERS``, uses a 30-second
    timeout and follows redirects.  A non-success HTTP status is treated as
    a failure.

    Returns:
        The parsed document, or ``None`` if anything went wrong while
        requesting, status-checking or parsing (the error is logged at
        ERROR level and not re-raised).
    """
    try:
        response = httpx.get(
            url,
            headers=HEADERS,
            timeout=30,
            follow_redirects=True,
        )
        response.raise_for_status()
        document = BeautifulSoup(response.text, "html.parser")
    except Exception as exc:
        # Deliberately best-effort: the caller treats None as "no data".
        log.error("Failed to fetch %s: %s", url, exc)
        return None
    return document
def normalise_competition(raw: str) -> str:
for key, val in COMPETITIONS.items():
if key in raw.lower():
return val
return raw.strip().title() if raw else "Liga Portugal"
def find_channel(text: str) -> str:
text_upper = text.upper()
for ch in TV_CHANNELS:
if ch.upper() in text_upper:
return ch
return "TBD"
def parse_score(text: str) -> str | None:
m = re.search(r"(\d+)\s*[-]\s*(\d+)", text)
return f"{m.group(1)} - {m.group(2)}" if m else None
def parse_date(text: str) -> datetime | None:
m = re.search(r"(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})", text)
if not m:
return None
tm = re.search(r"(\d{2}):(\d{2})", text)
hour, minute = (int(tm.group(1)), int(tm.group(2))) if tm else (0, 0)
try:
return datetime(
int(m.group(3)), int(m.group(2)), int(m.group(1)),
hour, minute, tzinfo=PT_TZ
)
except ValueError:
return None
def scrape_zerozero() -> list[dict]:
    """Scrape FC Porto's matches from the zerozero.pt team page.

    Every ``tr.odd`` / ``tr.even`` table row with at least three cells, a
    parseable date and at least two team links becomes one match dict with
    the keys ``home``, ``away``, ``opponent``, ``is_home``, ``competition``,
    ``channel``, ``date`` (``DD/MM/YY``), ``time`` (``HH:MM`` or ``"TBD"``
    when the parsed time is 00:00), ``timestamp``, ``score`` and
    ``is_past``.

    Returns:
        The matches sorted by ascending timestamp, or an empty list when
        the page could not be fetched.
    """
    log.info("Scraping zerozero.pt...")
    # FC Porto team page, matches ("jogos") tab.
    page = fetch("https://www.zerozero.pt/equipa.php?id=8&epoca_id=165")
    if not page:
        return []

    porto_names = {"fc porto", "porto", "f.c. porto"}
    reference_time = datetime.now(tz=PT_TZ)

    table_rows = page.select("table tr.odd, table tr.even")
    log.info("Found %d rows", len(table_rows))

    found = []
    for tr in table_rows:
        if len(tr.find_all("td")) < 3:
            continue

        row_text = tr.get_text(" ", strip=True)
        kickoff = parse_date(row_text)
        if not kickoff:
            continue

        anchors = tr.find_all("a", href=True)

        # Team names come from links to team pages; one-character labels
        # and empty labels are ignored.
        team_labels = (
            a.get_text(strip=True) for a in anchors if "equipa.php" in a["href"]
        )
        team_names = [label for label in team_labels if len(label) > 1]
        if len(team_names) < 2:
            continue
        home, away = team_names[:2]
        playing_at_home = home.lower() in porto_names

        # Competition: first link that points at a competition page or
        # whose text mentions "taça".
        competition_label = next(
            (
                a.get_text(strip=True)
                for a in anchors
                if "competicao" in a["href"] or "taça" in a.get_text().lower()
            ),
            "",
        )

        # A parsed time of exactly 00:00 means "no time found".
        if kickoff.hour or kickoff.minute:
            kickoff_time = kickoff.strftime("%H:%M")
        else:
            kickoff_time = "TBD"

        found.append({
            "home": home,
            "away": away,
            "opponent": away if playing_at_home else home,
            "is_home": playing_at_home,
            "competition": normalise_competition(competition_label),
            "channel": find_channel(row_text),
            "date": kickoff.strftime("%d/%m/%y"),
            "time": kickoff_time,
            "timestamp": kickoff.timestamp(),
            "score": parse_score(row_text),
            "is_past": kickoff < reference_time,
        })

    return sorted(found, key=lambda match: match["timestamp"])
def build_output(matches: list[dict]) -> dict:
now = datetime.now(tz=PT_TZ)
past = [m for m in matches if m["is_past"] and m.get("score")]
future = [m for m in matches if not m["is_past"]]
# last 2 results + next 2 fixtures
last2 = past[-2:] if len(past) >= 2 else past
next2 = future[:2] if len(future) >= 2 else future
# pad if needed
empty = {
"home": "", "away": "", "opponent": "", "is_home": True,
"competition": "", "channel": "", "date": "", "time": "",
"timestamp": 0, "score": None, "is_past": False,
}
while len(last2) < 2:
last2.insert(0, empty)
while len(next2) < 2:
next2.append(empty)
return {
"updated_at": now.isoformat(),
"team": "FC Porto",
"display": last2 + next2, # always 4 items: [past, past, future, future]
"all_past": past,
"all_future": future,
}
def main() -> None:
    """Scrape the fixtures and write the result to ``DATA_FILE`` as JSON.

    When the scrape yields nothing and a data file already exists, the file
    is left untouched so consumers keep the last good data.  When nothing
    was scraped and no file exists yet, a payload containing only
    placeholder entries is written.
    """
    matches = scrape_zerozero()
    if not matches:
        if DATA_FILE.exists():
            log.warning("No matches found — keeping existing data")
            return
        log.warning("No matches found — writing placeholder data")
        matches = []

    output = build_output(matches)
    # ensure_ascii=False emits raw non-ASCII characters (e.g. "Taça"), so
    # the encoding is pinned rather than left to the platform default.
    DATA_FILE.write_text(
        json.dumps(output, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    log.info(
        "Wrote %d past + %d future matches",
        len(output["all_past"]), len(output["all_future"])
    )


if __name__ == "__main__":
    main()