Update scraper/scraper.py
All checks were successful
Build & Push Football Docker Images / build-push-update (push) Successful in 8s
All checks were successful
Build & Push Football Docker Images / build-push-update (push) Successful in 8s
This commit is contained in:
@@ -1,12 +1,13 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
FC Porto scraper — zerozero.pt
|
FC Porto fixture scraper — ESPN
|
||||||
Scrapes last 2 results and next 2 fixtures including TV channel.
|
Server-side rendered, no JS needed, no API key required.
|
||||||
Writes /data/porto.json
|
Writes /data/porto.json
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -21,194 +22,185 @@ log = logging.getLogger(__name__)
|
|||||||
DATA_FILE = Path("/data/porto.json")
|
DATA_FILE = Path("/data/porto.json")
|
||||||
DATA_FILE.parent.mkdir(parents=True, exist_ok=True)
|
DATA_FILE.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
PT_TZ = ZoneInfo("Europe/Lisbon")
|
TEAM_ID = os.environ.get("ESPN_TEAM_ID", "437")
|
||||||
|
TEAM_SLUG = os.environ.get("ESPN_TEAM_SLUG", "por.porto")
|
||||||
|
TEAM_NAME = os.environ.get("TEAM_NAME", "FC Porto")
|
||||||
|
PT_TZ = ZoneInfo("Europe/Lisbon")
|
||||||
|
|
||||||
|
COMP_MAP = {
|
||||||
|
"Portuguese Primeira Liga": "LP",
|
||||||
|
"UEFA Europa League": "UE",
|
||||||
|
"UEFA Champions League": "CL",
|
||||||
|
"Taca de Portugal": "TP",
|
||||||
|
"Taça de Portugal": "TP",
|
||||||
|
"FIFA Club World Cup": "CW",
|
||||||
|
"UEFA Super Cup": "SC",
|
||||||
|
}
|
||||||
|
|
||||||
HEADERS = {
|
HEADERS = {
|
||||||
"User-Agent": (
|
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
||||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
"Chrome/124.0 Safari/537.36",
|
||||||
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
|
"Accept-Language": "en-GB,en;q=0.9",
|
||||||
),
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||||
"Accept-Language": "pt-PT,pt;q=0.9,en;q=0.8",
|
|
||||||
}
|
|
||||||
|
|
||||||
TV_CHANNELS = [
|
|
||||||
"Sport TV 1", "Sport TV 2", "Sport TV 3", "Sport TV 4", "Sport TV+",
|
|
||||||
"RTP 1", "RTP1", "SIC", "TVI", "Eleven Sports", "DAZN", "Canal+"
|
|
||||||
]
|
|
||||||
|
|
||||||
COMPETITIONS = {
|
|
||||||
"liga portugal": "Liga Portugal",
|
|
||||||
"primeira liga": "Liga Portugal",
|
|
||||||
"champions league": "Champions League",
|
|
||||||
"liga dos campeões": "Champions League",
|
|
||||||
"europa league": "Europa League",
|
|
||||||
"liga europa": "Europa League",
|
|
||||||
"taça de portugal": "Taça de Portugal",
|
|
||||||
"taca de portugal": "Taça de Portugal",
|
|
||||||
"supertaça": "Supertaça",
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def fetch(url: str) -> BeautifulSoup | None:
|
def scrape() -> list[dict]:
|
||||||
try:
|
log.info("Scraping ESPN for %s (id=%s)...", TEAM_NAME, TEAM_ID)
|
||||||
r = httpx.get(url, headers=HEADERS, timeout=30, follow_redirects=True)
|
|
||||||
r.raise_for_status()
|
|
||||||
return BeautifulSoup(r.text, "html.parser")
|
|
||||||
except Exception as e:
|
|
||||||
log.error("Failed to fetch %s: %s", url, e)
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
base = "https://www.espn.com/soccer/team"
|
||||||
|
urls = {
|
||||||
|
"future": f"{base}/fixtures/_/id/{TEAM_ID}/{TEAM_SLUG}",
|
||||||
|
"past": f"{base}/results/_/id/{TEAM_ID}/{TEAM_SLUG}",
|
||||||
|
}
|
||||||
|
|
||||||
def normalise_competition(raw: str) -> str:
|
porto_names = {"fc porto", "porto"}
|
||||||
for key, val in COMPETITIONS.items():
|
matches = []
|
||||||
if key in raw.lower():
|
|
||||||
return val
|
|
||||||
return raw.strip().title() if raw else "Liga Portugal"
|
|
||||||
|
|
||||||
|
for mode, url in urls.items():
|
||||||
|
is_past = mode == "past"
|
||||||
|
|
||||||
def find_channel(text: str) -> str:
|
try:
|
||||||
text_upper = text.upper()
|
r = httpx.get(url, headers=HEADERS, timeout=30, follow_redirects=True)
|
||||||
for ch in TV_CHANNELS:
|
r.raise_for_status()
|
||||||
if ch.upper() in text_upper:
|
soup = BeautifulSoup(r.text, "lxml")
|
||||||
return ch
|
except Exception as e:
|
||||||
return "TBD"
|
log.error("Failed to fetch %s: %s", url, e)
|
||||||
|
|
||||||
|
|
||||||
def parse_score(text: str) -> str | None:
|
|
||||||
m = re.search(r"(\d+)\s*[-–]\s*(\d+)", text)
|
|
||||||
return f"{m.group(1)} - {m.group(2)}" if m else None
|
|
||||||
|
|
||||||
|
|
||||||
def parse_date(text: str) -> datetime | None:
|
|
||||||
m = re.search(r"(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})", text)
|
|
||||||
if not m:
|
|
||||||
return None
|
|
||||||
tm = re.search(r"(\d{2}):(\d{2})", text)
|
|
||||||
hour, minute = (int(tm.group(1)), int(tm.group(2))) if tm else (0, 0)
|
|
||||||
try:
|
|
||||||
return datetime(
|
|
||||||
int(m.group(3)), int(m.group(2)), int(m.group(1)),
|
|
||||||
hour, minute, tzinfo=PT_TZ
|
|
||||||
)
|
|
||||||
except ValueError:
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def scrape_zerozero() -> list[dict]:
|
|
||||||
log.info("Scraping zerozero.pt...")
|
|
||||||
# Porto team page — jogos tab
|
|
||||||
soup = fetch("https://www.zerozero.pt/equipa.php?id=8&epoca_id=165")
|
|
||||||
if not soup:
|
|
||||||
return []
|
|
||||||
|
|
||||||
matches = []
|
|
||||||
now = datetime.now(tz=PT_TZ)
|
|
||||||
|
|
||||||
rows = soup.select("table tr.odd, table tr.even")
|
|
||||||
log.info("Found %d rows", len(rows))
|
|
||||||
|
|
||||||
for row in rows:
|
|
||||||
cells = row.find_all("td")
|
|
||||||
if len(cells) < 3:
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
full_text = row.get_text(" ", strip=True)
|
for table in soup.find_all("table"):
|
||||||
|
for row in table.find_all("tr"):
|
||||||
|
cells = row.find_all("td")
|
||||||
|
if len(cells) < 4:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
# ── Date ──────────────────────────────────────────
|
||||||
|
date_text = cells[0].get_text(strip=True)
|
||||||
|
|
||||||
dt = parse_date(full_text)
|
# ── Teams + logos ──────────────────────────────────
|
||||||
if not dt:
|
match_cell = cells[1]
|
||||||
continue
|
team_links = match_cell.find_all(
|
||||||
|
"a", href=re.compile(r"/soccer/team/_/id/\d+")
|
||||||
|
)
|
||||||
|
if len(team_links) < 2:
|
||||||
|
continue
|
||||||
|
|
||||||
# Teams — look for equipa links
|
home = team_links[0].get_text(strip=True)
|
||||||
teams = []
|
away = team_links[-1].get_text(strip=True)
|
||||||
for a in row.find_all("a", href=True):
|
home_id = re.search(r"/id/(\d+)/", team_links[0]["href"])
|
||||||
if "equipa.php" in a["href"]:
|
away_id = re.search(r"/id/(\d+)/", team_links[-1]["href"])
|
||||||
name = a.get_text(strip=True)
|
home_logo = (
|
||||||
if name and len(name) > 1:
|
f"https://a.espncdn.com/i/teamlogos/soccer/500/{home_id.group(1)}.png"
|
||||||
teams.append(name)
|
if home_id else None
|
||||||
|
)
|
||||||
|
away_logo = (
|
||||||
|
f"https://a.espncdn.com/i/teamlogos/soccer/500/{away_id.group(1)}.png"
|
||||||
|
if away_id else None
|
||||||
|
)
|
||||||
|
|
||||||
if len(teams) < 2:
|
is_home = home.lower() in porto_names
|
||||||
continue
|
opponent = away if is_home else home
|
||||||
|
opponent_logo = away_logo if is_home else home_logo
|
||||||
|
|
||||||
home, away = teams[0], teams[1]
|
# ── Score / time ───────────────────────────────────
|
||||||
porto_names = {"fc porto", "porto", "f.c. porto"}
|
score_cell = cells[3] if len(cells) > 3 else cells[2]
|
||||||
is_home = home.lower() in porto_names
|
score_text = score_cell.get_text(strip=True)
|
||||||
opponent = away if is_home else home
|
score_str = None
|
||||||
|
time_str = "TBD"
|
||||||
|
|
||||||
score = parse_score(full_text)
|
if is_past and re.match(r"\d+\s*-\s*\d+", score_text):
|
||||||
is_past = dt < now
|
score_str = score_text.replace(" ", "")
|
||||||
|
elif not is_past:
|
||||||
|
time_str = score_text if score_text not in ("", "TBD") else "TBD"
|
||||||
|
|
||||||
# Competition
|
# ── Competition ────────────────────────────────────
|
||||||
comp_raw = ""
|
comp_raw = cells[5].get_text(strip=True) if len(cells) > 5 else "—"
|
||||||
for a in row.find_all("a", href=True):
|
abbr = COMP_MAP.get(comp_raw, comp_raw[:2].upper() if comp_raw != "—" else "—")
|
||||||
if "competicao" in a["href"] or "taça" in a.get_text().lower():
|
|
||||||
comp_raw = a.get_text(strip=True)
|
|
||||||
break
|
|
||||||
competition = normalise_competition(comp_raw)
|
|
||||||
|
|
||||||
channel = find_channel(full_text)
|
# ── Date parse ─────────────────────────────────────
|
||||||
|
dt = None
|
||||||
|
year = datetime.now(PT_TZ).year
|
||||||
|
for fmt in ("%a, %b %d %Y", "%A, %B %d %Y"):
|
||||||
|
try:
|
||||||
|
dt = datetime.strptime(f"{date_text} {year}", fmt)
|
||||||
|
dt = dt.replace(tzinfo=PT_TZ)
|
||||||
|
# Roll over year if needed
|
||||||
|
if dt < datetime.now(PT_TZ).replace(month=1, day=1):
|
||||||
|
dt = dt.replace(year=year + 1)
|
||||||
|
break
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
|
||||||
matches.append({
|
matches.append({
|
||||||
"home": home,
|
"home": home,
|
||||||
"away": away,
|
"away": away,
|
||||||
"opponent": opponent,
|
"home_logo": home_logo,
|
||||||
"is_home": is_home,
|
"away_logo": away_logo,
|
||||||
"competition": competition,
|
"opponent": opponent,
|
||||||
"channel": channel,
|
"opponent_logo": opponent_logo,
|
||||||
"date": dt.strftime("%d/%m/%y"),
|
"is_home": is_home,
|
||||||
"time": dt.strftime("%H:%M") if dt.hour or dt.minute else "TBD",
|
"competition": comp_raw,
|
||||||
"timestamp": dt.timestamp(),
|
"abbr": abbr,
|
||||||
"score": score,
|
"date": dt.strftime("%d/%m/%y") if dt else date_text,
|
||||||
"is_past": is_past,
|
"time": time_str,
|
||||||
})
|
"timestamp": dt.timestamp() if dt else 0,
|
||||||
|
"score": score_str,
|
||||||
|
"is_past": is_past,
|
||||||
|
})
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
log.debug("Skipping row: %s", e)
|
||||||
|
continue
|
||||||
|
|
||||||
|
log.info(" %s: parsed %d matches", mode, sum(1 for m in matches if m["is_past"] == is_past))
|
||||||
|
|
||||||
matches.sort(key=lambda x: x["timestamp"])
|
matches.sort(key=lambda x: x["timestamp"])
|
||||||
|
log.info("Total: %d past + %d future",
|
||||||
|
sum(1 for m in matches if m["is_past"]),
|
||||||
|
sum(1 for m in matches if not m["is_past"]))
|
||||||
return matches
|
return matches
|
||||||
|
|
||||||
|
|
||||||
def build_output(matches: list[dict]) -> dict:
|
def build_output(matches: list[dict]) -> dict:
|
||||||
now = datetime.now(tz=PT_TZ)
|
past = [m for m in matches if m["is_past"] and m.get("score")]
|
||||||
past = [m for m in matches if m["is_past"] and m.get("score")]
|
|
||||||
future = [m for m in matches if not m["is_past"]]
|
future = [m for m in matches if not m["is_past"]]
|
||||||
|
|
||||||
# last 2 results + next 2 fixtures
|
last2 = past[-2:] if len(past) >= 2 else past
|
||||||
last2 = past[-2:] if len(past) >= 2 else past
|
|
||||||
next2 = future[:2] if len(future) >= 2 else future
|
next2 = future[:2] if len(future) >= 2 else future
|
||||||
|
|
||||||
# pad if needed
|
|
||||||
empty = {
|
empty = {
|
||||||
"home": "—", "away": "—", "opponent": "—", "is_home": True,
|
"home": "—", "away": "—", "home_logo": None, "away_logo": None,
|
||||||
"competition": "—", "channel": "—", "date": "—", "time": "—",
|
"opponent": "—", "opponent_logo": None, "is_home": True,
|
||||||
"timestamp": 0, "score": None, "is_past": False,
|
"competition": "—", "abbr": "—",
|
||||||
|
"date": "—", "time": "—", "timestamp": 0,
|
||||||
|
"score": None, "is_past": False,
|
||||||
}
|
}
|
||||||
while len(last2) < 2:
|
while len(last2) < 2:
|
||||||
last2.insert(0, empty)
|
last2.insert(0, dict(empty))
|
||||||
while len(next2) < 2:
|
while len(next2) < 2:
|
||||||
next2.append(empty)
|
next2.append(dict(empty))
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"updated_at": now.isoformat(),
|
"updated_at": datetime.now(tz=PT_TZ).isoformat(),
|
||||||
"team": "FC Porto",
|
"team": TEAM_NAME,
|
||||||
"display": last2 + next2, # always 4 items: [past, past, future, future]
|
"display": last2 + next2,
|
||||||
"all_past": past,
|
"all_past": past,
|
||||||
"all_future": future,
|
"all_future": future,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
matches = scrape_zerozero()
|
matches = scrape()
|
||||||
|
|
||||||
if not matches:
|
if not matches:
|
||||||
log.warning("No matches found — keeping existing data")
|
log.warning("No matches found — keeping existing data if present")
|
||||||
if DATA_FILE.exists():
|
if DATA_FILE.exists():
|
||||||
return
|
return
|
||||||
matches = []
|
matches = []
|
||||||
|
|
||||||
output = build_output(matches)
|
output = build_output(matches)
|
||||||
DATA_FILE.write_text(json.dumps(output, indent=2, ensure_ascii=False))
|
DATA_FILE.write_text(json.dumps(output, indent=2, ensure_ascii=False))
|
||||||
log.info(
|
log.info("Wrote %d total matches to %s", len(matches), DATA_FILE)
|
||||||
"Wrote %d past + %d future matches",
|
|
||||||
len(output["all_past"]), len(output["all_future"])
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Reference in New Issue
Block a user