Update scraper/scraper.py
All checks were successful
Build & Push Football Docker Images / build-push-update (push) Successful in 7s
All checks were successful
Build & Push Football Docker Images / build-push-update (push) Successful in 7s
This commit is contained in:
@@ -48,8 +48,16 @@ HEADERS = {
|
|||||||
def scrape() -> list[dict]:
|
def scrape() -> list[dict]:
|
||||||
log.info("Scraping ESPN for %s (id=%s)...", TEAM_NAME, TEAM_ID)
|
log.info("Scraping ESPN for %s (id=%s)...", TEAM_NAME, TEAM_ID)
|
||||||
|
|
||||||
base = "https://www.espn.com/soccer/team"
|
# ESPN table structure:
|
||||||
urls = {
|
# cell[0] = DATE
|
||||||
|
# cell[1] = HOME TEAM (link)
|
||||||
|
# cell[2] = SCORE or TIME (link for past, text for future)
|
||||||
|
# cell[3] = AWAY TEAM (link)
|
||||||
|
# cell[4] = FT / result status (past) | competition (future some)
|
||||||
|
# cell[5] = COMPETITION
|
||||||
|
|
||||||
|
base = "https://www.espn.com/soccer/team"
|
||||||
|
pages = {
|
||||||
"future": f"{base}/fixtures/_/id/{TEAM_ID}/{TEAM_SLUG}",
|
"future": f"{base}/fixtures/_/id/{TEAM_ID}/{TEAM_SLUG}",
|
||||||
"past": f"{base}/results/_/id/{TEAM_ID}/{TEAM_SLUG}",
|
"past": f"{base}/results/_/id/{TEAM_ID}/{TEAM_SLUG}",
|
||||||
}
|
}
|
||||||
@@ -57,7 +65,7 @@ def scrape() -> list[dict]:
|
|||||||
porto_names = {"fc porto", "porto"}
|
porto_names = {"fc porto", "porto"}
|
||||||
matches = []
|
matches = []
|
||||||
|
|
||||||
for mode, url in urls.items():
|
for mode, url in pages.items():
|
||||||
is_past = mode == "past"
|
is_past = mode == "past"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -68,64 +76,73 @@ def scrape() -> list[dict]:
|
|||||||
log.error("Failed to fetch %s: %s", url, e)
|
log.error("Failed to fetch %s: %s", url, e)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
count = 0
|
||||||
for table in soup.find_all("table"):
|
for table in soup.find_all("table"):
|
||||||
for row in table.find_all("tr"):
|
for row in table.find_all("tr"):
|
||||||
cells = row.find_all("td")
|
cells = row.find_all("td")
|
||||||
if len(cells) < 4:
|
if len(cells) < 5:
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
# ── Date ──────────────────────────────────────────
|
# ── Date ──────────────────────────────────────
|
||||||
date_text = cells[0].get_text(strip=True)
|
date_text = cells[0].get_text(strip=True)
|
||||||
|
if not date_text or date_text == "DATE":
|
||||||
# ── Teams + logos ──────────────────────────────────
|
|
||||||
match_cell = cells[1]
|
|
||||||
team_links = match_cell.find_all(
|
|
||||||
"a", href=re.compile(r"/soccer/team/_/id/\d+")
|
|
||||||
)
|
|
||||||
if len(team_links) < 2:
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
home = team_links[0].get_text(strip=True)
|
# ── Home team ──────────────────────────────────
|
||||||
away = team_links[-1].get_text(strip=True)
|
home_link = cells[1].find("a", href=re.compile(r"/soccer/team/_/id/\d+"))
|
||||||
home_id = re.search(r"/id/(\d+)/", team_links[0]["href"])
|
if not home_link:
|
||||||
away_id = re.search(r"/id/(\d+)/", team_links[-1]["href"])
|
continue
|
||||||
home_logo = (
|
home = home_link.get_text(strip=True)
|
||||||
f"https://a.espncdn.com/i/teamlogos/soccer/500/{home_id.group(1)}.png"
|
home_m = re.search(r"/id/(\d+)/", home_link["href"])
|
||||||
if home_id else None
|
home_id = home_m.group(1) if home_m else None
|
||||||
)
|
home_logo = f"https://a.espncdn.com/i/teamlogos/soccer/500/{home_id}.png" if home_id else None
|
||||||
away_logo = (
|
|
||||||
f"https://a.espncdn.com/i/teamlogos/soccer/500/{away_id.group(1)}.png"
|
|
||||||
if away_id else None
|
|
||||||
)
|
|
||||||
|
|
||||||
|
# ── Score / time (cell[2]) ─────────────────────
|
||||||
|
mid_text = cells[2].get_text(strip=True)
|
||||||
|
|
||||||
|
# ── Away team ──────────────────────────────────
|
||||||
|
away_link = cells[3].find("a", href=re.compile(r"/soccer/team/_/id/\d+"))
|
||||||
|
if not away_link:
|
||||||
|
continue
|
||||||
|
away = away_link.get_text(strip=True)
|
||||||
|
away_m = re.search(r"/id/(\d+)/", away_link["href"])
|
||||||
|
away_id = away_m.group(1) if away_m else None
|
||||||
|
away_logo = f"https://a.espncdn.com/i/teamlogos/soccer/500/{away_id}.png" if away_id else None
|
||||||
|
|
||||||
|
# ── Competition ────────────────────────────────
|
||||||
|
comp_raw = cells[5].get_text(strip=True) if len(cells) > 5 else \
|
||||||
|
cells[4].get_text(strip=True)
|
||||||
|
# Strip "FT" from competition if it crept in
|
||||||
|
if comp_raw in ("FT", "AET", "Pen"):
|
||||||
|
comp_raw = cells[5].get_text(strip=True) if len(cells) > 5 else "—"
|
||||||
|
abbr = COMP_MAP.get(comp_raw, comp_raw[:2].upper() if comp_raw not in ("—", "") else "—")
|
||||||
|
|
||||||
|
# ── Porto perspective ──────────────────────────
|
||||||
is_home = home.lower() in porto_names
|
is_home = home.lower() in porto_names
|
||||||
opponent = away if is_home else home
|
opponent = away if is_home else home
|
||||||
opponent_logo = away_logo if is_home else home_logo
|
opponent_logo = away_logo if is_home else home_logo
|
||||||
|
|
||||||
# ── Score / time ───────────────────────────────────
|
# ── Score vs time ──────────────────────────────
|
||||||
score_cell = cells[3] if len(cells) > 3 else cells[2]
|
score_str = None
|
||||||
score_text = score_cell.get_text(strip=True)
|
time_str = "TBD"
|
||||||
score_str = None
|
if is_past:
|
||||||
time_str = "TBD"
|
if re.match(r"\d+\s*-\s*\d+", mid_text):
|
||||||
|
score_str = mid_text.replace(" ", "")
|
||||||
|
else:
|
||||||
|
continue # skip if no valid score
|
||||||
|
else:
|
||||||
|
time_str = mid_text if mid_text not in ("", "TBD") else "TBD"
|
||||||
|
|
||||||
if is_past and re.match(r"\d+\s*-\s*\d+", score_text):
|
# ── Parse date ─────────────────────────────────
|
||||||
score_str = score_text.replace(" ", "")
|
|
||||||
elif not is_past:
|
|
||||||
time_str = score_text if score_text not in ("", "TBD") else "TBD"
|
|
||||||
|
|
||||||
# ── Competition ────────────────────────────────────
|
|
||||||
comp_raw = cells[5].get_text(strip=True) if len(cells) > 5 else "—"
|
|
||||||
abbr = COMP_MAP.get(comp_raw, comp_raw[:2].upper() if comp_raw != "—" else "—")
|
|
||||||
|
|
||||||
# ── Date parse ─────────────────────────────────────
|
|
||||||
dt = None
|
dt = None
|
||||||
year = datetime.now(PT_TZ).year
|
year = datetime.now(PT_TZ).year
|
||||||
for fmt in ("%a, %b %d %Y", "%A, %B %d %Y"):
|
for fmt in ("%a, %b %d %Y", "%A, %B %d %Y"):
|
||||||
try:
|
try:
|
||||||
dt = datetime.strptime(f"{date_text} {year}", fmt)
|
dt = datetime.strptime(f"{date_text} {year}", fmt)
|
||||||
dt = dt.replace(tzinfo=PT_TZ)
|
dt = dt.replace(tzinfo=PT_TZ)
|
||||||
# Roll over year if needed
|
now = datetime.now(PT_TZ)
|
||||||
if dt < datetime.now(PT_TZ).replace(month=1, day=1):
|
# roll year forward for future dates that wrap
|
||||||
|
if not is_past and dt.month < now.month - 2:
|
||||||
dt = dt.replace(year=year + 1)
|
dt = dt.replace(year=year + 1)
|
||||||
break
|
break
|
||||||
except ValueError:
|
except ValueError:
|
||||||
@@ -147,22 +164,23 @@ def scrape() -> list[dict]:
|
|||||||
"score": score_str,
|
"score": score_str,
|
||||||
"is_past": is_past,
|
"is_past": is_past,
|
||||||
})
|
})
|
||||||
|
count += 1
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.debug("Skipping row: %s", e)
|
log.debug("Skipping row: %s", e)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
log.info(" %s: parsed %d matches", mode, sum(1 for m in matches if m["is_past"] == is_past))
|
log.info(" %s: %d matches parsed", mode, count)
|
||||||
|
|
||||||
matches.sort(key=lambda x: x["timestamp"])
|
matches.sort(key=lambda x: x["timestamp"])
|
||||||
log.info("Total: %d past + %d future",
|
log.info("Total: %d past + %d future",
|
||||||
sum(1 for m in matches if m["is_past"]),
|
sum(1 for m in matches if m["is_past"]),
|
||||||
sum(1 for m in matches if not m["is_past"]))
|
sum(1 for m in matches if not m["is_past"]))
|
||||||
return matches
|
return matches
|
||||||
|
|
||||||
|
|
||||||
def build_output(matches: list[dict]) -> dict:
|
def build_output(matches: list[dict]) -> dict:
|
||||||
past = [m for m in matches if m["is_past"] and m.get("score")]
|
past = [m for m in matches if m["is_past"]]
|
||||||
future = [m for m in matches if not m["is_past"]]
|
future = [m for m in matches if not m["is_past"]]
|
||||||
|
|
||||||
last2 = past[-2:] if len(past) >= 2 else past
|
last2 = past[-2:] if len(past) >= 2 else past
|
||||||
|
|||||||
Reference in New Issue
Block a user