Update scraper/scrapper.py
Some checks failed
Build & Push Football Docker Images / build-push-update (push) Failing after 3s
Some checks failed
Build & Push Football Docker Images / build-push-update (push) Failing after 3s
This commit is contained in:
215
scraper/scrapper.py
Normal file
215
scraper/scrapper.py
Normal file
@@ -0,0 +1,215 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
FC Porto scraper — zerozero.pt

Scrapes the last 2 results and the next 2 fixtures, including the TV channel,
and writes them to /data/porto.json.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Log lines carry a timestamp, the severity and then the message.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
log = logging.getLogger(__name__)

# Timezone attached to parsed kick-off dates and to the "updated_at" stamp.
PT_TZ = ZoneInfo("Europe/Lisbon")

# Destination of the JSON output; its directory is created at import time
# when it does not exist yet.
DATA_FILE = Path("/data/porto.json")
DATA_FILE.parent.mkdir(exist_ok=True, parents=True)
|
||||
|
||||
# Request headers sent with every page fetch: a desktop-browser User-Agent
# and a preference for Portuguese content.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "pt-PT,pt;q=0.9,en;q=0.8",
}

# Broadcaster names searched for in the text of a fixture row. Order matters:
# the first name found is the one reported.
TV_CHANNELS = [
    "Sport TV 1", "Sport TV 2", "Sport TV 3", "Sport TV 4", "Sport TV+",
    "RTP 1", "RTP1", "SIC", "TVI", "Eleven Sports", "DAZN", "Canal+",
]

# Lower-case fragment -> canonical competition name. Lookup is by substring,
# and the first matching key (in insertion order) wins.
COMPETITIONS = {
    "liga portugal": "Liga Portugal",
    "primeira liga": "Liga Portugal",
    "champions league": "Champions League",
    "liga dos campeões": "Champions League",
    "europa league": "Europa League",
    "liga europa": "Europa League",
    "taça de portugal": "Taça de Portugal",
    "taca de portugal": "Taça de Portugal",
    "supertaça": "Supertaça",
    # Unaccented spelling, mirroring the "taca de portugal" alias above.
    "supertaca": "Supertaça",
}
|
||||
|
||||
|
||||
def fetch(url: str) -> BeautifulSoup | None:
    """Download *url* and return the page parsed with ``html.parser``.

    Redirects are followed and the request times out after 30 seconds.
    Any failure (transport error, non-2xx status, parsing problem) is
    logged and reported to the caller as ``None`` instead of raising.
    """
    try:
        response = httpx.get(
            url,
            headers=HEADERS,
            timeout=30,
            follow_redirects=True,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
    except Exception as exc:
        log.error("Failed to fetch %s: %s", url, exc)
        return None
    return soup
|
||||
|
||||
|
||||
def normalise_competition(raw: str) -> str:
|
||||
for key, val in COMPETITIONS.items():
|
||||
if key in raw.lower():
|
||||
return val
|
||||
return raw.strip().title() if raw else "Liga Portugal"
|
||||
|
||||
|
||||
def find_channel(text: str) -> str:
|
||||
text_upper = text.upper()
|
||||
for ch in TV_CHANNELS:
|
||||
if ch.upper() in text_upper:
|
||||
return ch
|
||||
return "TBD"
|
||||
|
||||
|
||||
def parse_score(text: str) -> str | None:
|
||||
m = re.search(r"(\d+)\s*[-–]\s*(\d+)", text)
|
||||
return f"{m.group(1)} - {m.group(2)}" if m else None
|
||||
|
||||
|
||||
def parse_date(text: str) -> datetime | None:
    """Build a timezone-aware datetime from a ``DD/MM/YYYY`` date in *text*.

    The date may use ``/`` or ``-`` as separator. If the text also contains
    an ``HH:MM`` time it is used; otherwise the time defaults to 00:00.
    The result carries the ``PT_TZ`` timezone.

    Returns ``None`` when no date is present or when the captured numbers
    do not form a valid date/time.
    """
    date_match = re.search(r"(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})", text)
    if date_match is None:
        return None
    day, month, year = (int(part) for part in date_match.groups())

    time_match = re.search(r"(\d{2}):(\d{2})", text)
    if time_match:
        hour = int(time_match.group(1))
        minute = int(time_match.group(2))
    else:
        hour = minute = 0

    try:
        return datetime(year, month, day, hour, minute, tzinfo=PT_TZ)
    except ValueError:
        # Out-of-range values such as month 13 or hour 25.
        return None
|
||||
|
||||
|
||||
# Lower-cased spellings under which FC Porto may appear as a team link.
_PORTO_NAMES = frozenset({"fc porto", "porto", "f.c. porto"})


def _parse_match_row(row, now: datetime) -> dict | None:
    """Turn one results-table row into a match dict.

    Returns ``None`` for rows that are not usable: fewer than three cells,
    no parseable date, or fewer than two team links.
    """
    if len(row.find_all("td")) < 3:
        return None

    full_text = row.get_text(" ", strip=True)
    dt = parse_date(full_text)
    if not dt:
        return None

    # One pass over the row's links collects both the team names
    # (links to equipa.php) and the first competition link.
    teams = []
    comp_raw = None
    for a in row.find_all("a", href=True):
        href = a["href"]
        if "equipa.php" in href:
            name = a.get_text(strip=True)
            if len(name) > 1:
                teams.append(name)
        if comp_raw is None and (
            "competicao" in href or "taça" in a.get_text().lower()
        ):
            comp_raw = a.get_text(strip=True)

    if len(teams) < 2:
        return None

    home, away = teams[0], teams[1]
    is_home = home.lower() in _PORTO_NAMES

    return {
        "home": home,
        "away": away,
        "opponent": away if is_home else home,
        "is_home": is_home,
        "competition": normalise_competition(comp_raw or ""),
        "channel": find_channel(full_text),
        "date": dt.strftime("%d/%m/%y"),
        # 00:00 is what parse_date yields when the row has no time.
        "time": dt.strftime("%H:%M") if dt.hour or dt.minute else "TBD",
        "timestamp": dt.timestamp(),
        "score": parse_score(full_text),
        "is_past": dt < now,
    }


def scrape_zerozero(
    url: str = "https://www.zerozero.pt/equipa.php?id=8&epoca_id=165",
) -> list[dict]:
    """Scrape FC Porto's match list from zerozero.pt.

    Args:
        url: Team page to scrape. The default is the page the scraper has
            always used; ``epoca_id`` presumably selects the season
            (TODO confirm), so callers can pass a different URL.

    Returns:
        One dict per usable table row, sorted by kick-off timestamp, with
        the keys ``home``, ``away``, ``opponent``, ``is_home``,
        ``competition``, ``channel``, ``date``, ``time``, ``timestamp``,
        ``score`` and ``is_past``. Empty when the page cannot be fetched.
    """
    log.info("Scraping zerozero.pt...")
    soup = fetch(url)
    if not soup:
        return []

    # A single reference time so every row is classified consistently.
    now = datetime.now(tz=PT_TZ)

    rows = soup.select("table tr.odd, table tr.even")
    log.info("Found %d rows", len(rows))

    parsed = (_parse_match_row(row, now) for row in rows)
    matches = [match for match in parsed if match is not None]
    matches.sort(key=lambda x: x["timestamp"])
    return matches
|
||||
|
||||
|
||||
def build_output(matches: list[dict]) -> dict:
|
||||
now = datetime.now(tz=PT_TZ)
|
||||
past = [m for m in matches if m["is_past"] and m.get("score")]
|
||||
future = [m for m in matches if not m["is_past"]]
|
||||
|
||||
# last 2 results + next 2 fixtures
|
||||
last2 = past[-2:] if len(past) >= 2 else past
|
||||
next2 = future[:2] if len(future) >= 2 else future
|
||||
|
||||
# pad if needed
|
||||
empty = {
|
||||
"home": "—", "away": "—", "opponent": "—", "is_home": True,
|
||||
"competition": "—", "channel": "—", "date": "—", "time": "—",
|
||||
"timestamp": 0, "score": None, "is_past": False,
|
||||
}
|
||||
while len(last2) < 2:
|
||||
last2.insert(0, empty)
|
||||
while len(next2) < 2:
|
||||
next2.append(empty)
|
||||
|
||||
return {
|
||||
"updated_at": now.isoformat(),
|
||||
"team": "FC Porto",
|
||||
"display": last2 + next2, # always 4 items: [past, past, future, future]
|
||||
"all_past": past,
|
||||
"all_future": future,
|
||||
}
|
||||
|
||||
|
||||
def main() -> None:
    """Scrape the fixtures and write the JSON file.

    When the scrape yields nothing and a data file already exists, the file
    is left untouched so that earlier data is not replaced by placeholders.
    When no file exists yet, a placeholder-only payload is written.
    """
    matches = scrape_zerozero()

    if not matches:
        if DATA_FILE.exists():
            log.warning("No matches found — keeping existing data")
            return
        log.warning("No matches found — writing placeholder data")

    output = build_output(matches)
    # The payload holds non-ASCII text (ensure_ascii=False), so the encoding
    # is fixed explicitly instead of depending on the process locale.
    DATA_FILE.write_text(
        json.dumps(output, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    log.info(
        "Wrote %d past + %d future matches",
        len(output["all_past"]), len(output["all_future"])
    )
|
||||
|
||||
|
||||
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user