Update scraper/scraper.py
All checks were successful
Build & Push Football Docker Images / build-push-update (push) Successful in 7s

This commit is contained in:
2026-04-12 13:27:16 +00:00
parent 9428088c13
commit 2f8e0a517a

View File

@@ -48,8 +48,16 @@ HEADERS = {
def scrape() -> list[dict]: def scrape() -> list[dict]:
log.info("Scraping ESPN for %s (id=%s)...", TEAM_NAME, TEAM_ID) log.info("Scraping ESPN for %s (id=%s)...", TEAM_NAME, TEAM_ID)
# ESPN table structure:
# cell[0] = DATE
# cell[1] = HOME TEAM (link)
# cell[2] = SCORE or TIME (link for past, text for future)
# cell[3] = AWAY TEAM (link)
# cell[4] = FT / result status (past rows) | competition (some future rows)
# cell[5] = COMPETITION
base = "https://www.espn.com/soccer/team" base = "https://www.espn.com/soccer/team"
urls = { pages = {
"future": f"{base}/fixtures/_/id/{TEAM_ID}/{TEAM_SLUG}", "future": f"{base}/fixtures/_/id/{TEAM_ID}/{TEAM_SLUG}",
"past": f"{base}/results/_/id/{TEAM_ID}/{TEAM_SLUG}", "past": f"{base}/results/_/id/{TEAM_ID}/{TEAM_SLUG}",
} }
@@ -57,7 +65,7 @@ def scrape() -> list[dict]:
porto_names = {"fc porto", "porto"} porto_names = {"fc porto", "porto"}
matches = [] matches = []
for mode, url in urls.items(): for mode, url in pages.items():
is_past = mode == "past" is_past = mode == "past"
try: try:
@@ -68,64 +76,73 @@ def scrape() -> list[dict]:
log.error("Failed to fetch %s: %s", url, e) log.error("Failed to fetch %s: %s", url, e)
continue continue
count = 0
for table in soup.find_all("table"): for table in soup.find_all("table"):
for row in table.find_all("tr"): for row in table.find_all("tr"):
cells = row.find_all("td") cells = row.find_all("td")
if len(cells) < 4: if len(cells) < 5:
continue continue
try: try:
# ── Date ────────────────────────────────────────── # ── Date ──────────────────────────────────────
date_text = cells[0].get_text(strip=True) date_text = cells[0].get_text(strip=True)
if not date_text or date_text == "DATE":
# ── Teams + logos ──────────────────────────────────
match_cell = cells[1]
team_links = match_cell.find_all(
"a", href=re.compile(r"/soccer/team/_/id/\d+")
)
if len(team_links) < 2:
continue continue
home = team_links[0].get_text(strip=True) # ── Home team ──────────────────────────────────
away = team_links[-1].get_text(strip=True) home_link = cells[1].find("a", href=re.compile(r"/soccer/team/_/id/\d+"))
home_id = re.search(r"/id/(\d+)/", team_links[0]["href"]) if not home_link:
away_id = re.search(r"/id/(\d+)/", team_links[-1]["href"]) continue
home_logo = ( home = home_link.get_text(strip=True)
f"https://a.espncdn.com/i/teamlogos/soccer/500/{home_id.group(1)}.png" home_m = re.search(r"/id/(\d+)/", home_link["href"])
if home_id else None home_id = home_m.group(1) if home_m else None
) home_logo = f"https://a.espncdn.com/i/teamlogos/soccer/500/{home_id}.png" if home_id else None
away_logo = (
f"https://a.espncdn.com/i/teamlogos/soccer/500/{away_id.group(1)}.png"
if away_id else None
)
# ── Score / time (cell[2]) ─────────────────────
mid_text = cells[2].get_text(strip=True)
# ── Away team ──────────────────────────────────
away_link = cells[3].find("a", href=re.compile(r"/soccer/team/_/id/\d+"))
if not away_link:
continue
away = away_link.get_text(strip=True)
away_m = re.search(r"/id/(\d+)/", away_link["href"])
away_id = away_m.group(1) if away_m else None
away_logo = f"https://a.espncdn.com/i/teamlogos/soccer/500/{away_id}.png" if away_id else None
# ── Competition ────────────────────────────────
comp_raw = cells[5].get_text(strip=True) if len(cells) > 5 else \
cells[4].get_text(strip=True)
# If a result-status marker ("FT", "AET", "Pen") was read as the competition, re-read cell[5] when present, else leave it empty
if comp_raw in ("FT", "AET", "Pen"):
comp_raw = cells[5].get_text(strip=True) if len(cells) > 5 else ""
abbr = COMP_MAP.get(comp_raw, comp_raw[:2].upper() if comp_raw not in ("", "") else "")
# ── Porto perspective ──────────────────────────
is_home = home.lower() in porto_names is_home = home.lower() in porto_names
opponent = away if is_home else home opponent = away if is_home else home
opponent_logo = away_logo if is_home else home_logo opponent_logo = away_logo if is_home else home_logo
# ── Score / time ─────────────────────────────────── # ── Score vs time ──────────────────────────────
score_cell = cells[3] if len(cells) > 3 else cells[2]
score_text = score_cell.get_text(strip=True)
score_str = None score_str = None
time_str = "TBD" time_str = "TBD"
if is_past:
if re.match(r"\d+\s*-\s*\d+", mid_text):
score_str = mid_text.replace(" ", "")
else:
continue # skip if no valid score
else:
time_str = mid_text if mid_text not in ("", "TBD") else "TBD"
if is_past and re.match(r"\d+\s*-\s*\d+", score_text): # ── Parse date ─────────────────────────────────
score_str = score_text.replace(" ", "")
elif not is_past:
time_str = score_text if score_text not in ("", "TBD") else "TBD"
# ── Competition ────────────────────────────────────
comp_raw = cells[5].get_text(strip=True) if len(cells) > 5 else ""
abbr = COMP_MAP.get(comp_raw, comp_raw[:2].upper() if comp_raw != "" else "")
# ── Date parse ─────────────────────────────────────
dt = None dt = None
year = datetime.now(PT_TZ).year year = datetime.now(PT_TZ).year
for fmt in ("%a, %b %d %Y", "%A, %B %d %Y"): for fmt in ("%a, %b %d %Y", "%A, %B %d %Y"):
try: try:
dt = datetime.strptime(f"{date_text} {year}", fmt) dt = datetime.strptime(f"{date_text} {year}", fmt)
dt = dt.replace(tzinfo=PT_TZ) dt = dt.replace(tzinfo=PT_TZ)
# Roll over year if needed now = datetime.now(PT_TZ)
if dt < datetime.now(PT_TZ).replace(month=1, day=1): # roll year forward for future dates that wrap
if not is_past and dt.month < now.month - 2:
dt = dt.replace(year=year + 1) dt = dt.replace(year=year + 1)
break break
except ValueError: except ValueError:
@@ -147,12 +164,13 @@ def scrape() -> list[dict]:
"score": score_str, "score": score_str,
"is_past": is_past, "is_past": is_past,
}) })
count += 1
except Exception as e: except Exception as e:
log.debug("Skipping row: %s", e) log.debug("Skipping row: %s", e)
continue continue
log.info(" %s: parsed %d matches", mode, sum(1 for m in matches if m["is_past"] == is_past)) log.info(" %s: %d matches parsed", mode, count)
matches.sort(key=lambda x: x["timestamp"]) matches.sort(key=lambda x: x["timestamp"])
log.info("Total: %d past + %d future", log.info("Total: %d past + %d future",
@@ -162,7 +180,7 @@ def scrape() -> list[dict]:
def build_output(matches: list[dict]) -> dict: def build_output(matches: list[dict]) -> dict:
past = [m for m in matches if m["is_past"] and m.get("score")] past = [m for m in matches if m["is_past"]]
future = [m for m in matches if not m["is_past"]] future = [m for m in matches if not m["is_past"]]
last2 = past[-2:] if len(past) >= 2 else past last2 = past[-2:] if len(past) >= 2 else past