Update scraper/scraper.py
All checks were successful
Build & Push Football Docker Images / build-push-update (push) Successful in 7s
All checks were successful
Build & Push Football Docker Images / build-push-update (push) Successful in 7s
This commit is contained in:
@@ -48,8 +48,16 @@ HEADERS = {
|
||||
def scrape() -> list[dict]:
|
||||
log.info("Scraping ESPN for %s (id=%s)...", TEAM_NAME, TEAM_ID)
|
||||
|
||||
# ESPN table structure:
|
||||
# cell[0] = DATE
|
||||
# cell[1] = HOME TEAM (link)
|
||||
# cell[2] = SCORE or TIME (link for past, text for future)
|
||||
# cell[3] = AWAY TEAM (link)
|
||||
# cell[4] = FT / result status (past) | competition (some future rows)
|
||||
# cell[5] = COMPETITION
|
||||
|
||||
base = "https://www.espn.com/soccer/team"
|
||||
urls = {
|
||||
pages = {
|
||||
"future": f"{base}/fixtures/_/id/{TEAM_ID}/{TEAM_SLUG}",
|
||||
"past": f"{base}/results/_/id/{TEAM_ID}/{TEAM_SLUG}",
|
||||
}
|
||||
@@ -57,7 +65,7 @@ def scrape() -> list[dict]:
|
||||
porto_names = {"fc porto", "porto"}
|
||||
matches = []
|
||||
|
||||
for mode, url in urls.items():
|
||||
for mode, url in pages.items():
|
||||
is_past = mode == "past"
|
||||
|
||||
try:
|
||||
@@ -68,64 +76,73 @@ def scrape() -> list[dict]:
|
||||
log.error("Failed to fetch %s: %s", url, e)
|
||||
continue
|
||||
|
||||
count = 0
|
||||
for table in soup.find_all("table"):
|
||||
for row in table.find_all("tr"):
|
||||
cells = row.find_all("td")
|
||||
if len(cells) < 4:
|
||||
if len(cells) < 5:
|
||||
continue
|
||||
try:
|
||||
# ── Date ──────────────────────────────────────────
|
||||
# ── Date ──────────────────────────────────────
|
||||
date_text = cells[0].get_text(strip=True)
|
||||
|
||||
# ── Teams + logos ──────────────────────────────────
|
||||
match_cell = cells[1]
|
||||
team_links = match_cell.find_all(
|
||||
"a", href=re.compile(r"/soccer/team/_/id/\d+")
|
||||
)
|
||||
if len(team_links) < 2:
|
||||
if not date_text or date_text == "DATE":
|
||||
continue
|
||||
|
||||
home = team_links[0].get_text(strip=True)
|
||||
away = team_links[-1].get_text(strip=True)
|
||||
home_id = re.search(r"/id/(\d+)/", team_links[0]["href"])
|
||||
away_id = re.search(r"/id/(\d+)/", team_links[-1]["href"])
|
||||
home_logo = (
|
||||
f"https://a.espncdn.com/i/teamlogos/soccer/500/{home_id.group(1)}.png"
|
||||
if home_id else None
|
||||
)
|
||||
away_logo = (
|
||||
f"https://a.espncdn.com/i/teamlogos/soccer/500/{away_id.group(1)}.png"
|
||||
if away_id else None
|
||||
)
|
||||
# ── Home team ──────────────────────────────────
|
||||
home_link = cells[1].find("a", href=re.compile(r"/soccer/team/_/id/\d+"))
|
||||
if not home_link:
|
||||
continue
|
||||
home = home_link.get_text(strip=True)
|
||||
home_m = re.search(r"/id/(\d+)/", home_link["href"])
|
||||
home_id = home_m.group(1) if home_m else None
|
||||
home_logo = f"https://a.espncdn.com/i/teamlogos/soccer/500/{home_id}.png" if home_id else None
|
||||
|
||||
# ── Score / time (cell[2]) ─────────────────────
|
||||
mid_text = cells[2].get_text(strip=True)
|
||||
|
||||
# ── Away team ──────────────────────────────────
|
||||
away_link = cells[3].find("a", href=re.compile(r"/soccer/team/_/id/\d+"))
|
||||
if not away_link:
|
||||
continue
|
||||
away = away_link.get_text(strip=True)
|
||||
away_m = re.search(r"/id/(\d+)/", away_link["href"])
|
||||
away_id = away_m.group(1) if away_m else None
|
||||
away_logo = f"https://a.espncdn.com/i/teamlogos/soccer/500/{away_id}.png" if away_id else None
|
||||
|
||||
# ── Competition ────────────────────────────────
|
||||
comp_raw = cells[5].get_text(strip=True) if len(cells) > 5 else \
|
||||
cells[4].get_text(strip=True)
|
||||
# Strip "FT" from competition if it crept in
|
||||
if comp_raw in ("FT", "AET", "Pen"):
|
||||
comp_raw = cells[5].get_text(strip=True) if len(cells) > 5 else "—"
|
||||
abbr = COMP_MAP.get(comp_raw, comp_raw[:2].upper() if comp_raw not in ("—", "") else "—")
|
||||
|
||||
# ── Porto perspective ──────────────────────────
|
||||
is_home = home.lower() in porto_names
|
||||
opponent = away if is_home else home
|
||||
opponent_logo = away_logo if is_home else home_logo
|
||||
|
||||
# ── Score / time ───────────────────────────────────
|
||||
score_cell = cells[3] if len(cells) > 3 else cells[2]
|
||||
score_text = score_cell.get_text(strip=True)
|
||||
# ── Score vs time ──────────────────────────────
|
||||
score_str = None
|
||||
time_str = "TBD"
|
||||
if is_past:
|
||||
if re.match(r"\d+\s*-\s*\d+", mid_text):
|
||||
score_str = mid_text.replace(" ", "")
|
||||
else:
|
||||
continue # skip if no valid score
|
||||
else:
|
||||
time_str = mid_text if mid_text not in ("", "TBD") else "TBD"
|
||||
|
||||
if is_past and re.match(r"\d+\s*-\s*\d+", score_text):
|
||||
score_str = score_text.replace(" ", "")
|
||||
elif not is_past:
|
||||
time_str = score_text if score_text not in ("", "TBD") else "TBD"
|
||||
|
||||
# ── Competition ────────────────────────────────────
|
||||
comp_raw = cells[5].get_text(strip=True) if len(cells) > 5 else "—"
|
||||
abbr = COMP_MAP.get(comp_raw, comp_raw[:2].upper() if comp_raw != "—" else "—")
|
||||
|
||||
# ── Date parse ─────────────────────────────────────
|
||||
# ── Parse date ─────────────────────────────────
|
||||
dt = None
|
||||
year = datetime.now(PT_TZ).year
|
||||
for fmt in ("%a, %b %d %Y", "%A, %B %d %Y"):
|
||||
try:
|
||||
dt = datetime.strptime(f"{date_text} {year}", fmt)
|
||||
dt = dt.replace(tzinfo=PT_TZ)
|
||||
# Roll over year if needed
|
||||
if dt < datetime.now(PT_TZ).replace(month=1, day=1):
|
||||
now = datetime.now(PT_TZ)
|
||||
# roll year forward for future dates that wrap
|
||||
if not is_past and dt.month < now.month - 2:
|
||||
dt = dt.replace(year=year + 1)
|
||||
break
|
||||
except ValueError:
|
||||
@@ -147,12 +164,13 @@ def scrape() -> list[dict]:
|
||||
"score": score_str,
|
||||
"is_past": is_past,
|
||||
})
|
||||
count += 1
|
||||
|
||||
except Exception as e:
|
||||
log.debug("Skipping row: %s", e)
|
||||
continue
|
||||
|
||||
log.info(" %s: parsed %d matches", mode, sum(1 for m in matches if m["is_past"] == is_past))
|
||||
log.info(" %s: %d matches parsed", mode, count)
|
||||
|
||||
matches.sort(key=lambda x: x["timestamp"])
|
||||
log.info("Total: %d past + %d future",
|
||||
@@ -162,7 +180,7 @@ def scrape() -> list[dict]:
|
||||
|
||||
|
||||
def build_output(matches: list[dict]) -> dict:
|
||||
past = [m for m in matches if m["is_past"] and m.get("score")]
|
||||
past = [m for m in matches if m["is_past"]]
|
||||
future = [m for m in matches if not m["is_past"]]
|
||||
|
||||
last2 = past[-2:] if len(past) >= 2 else past
|
||||
|
||||
Reference in New Issue
Block a user