Working version

This commit is contained in:
2025-09-15 11:36:43 -04:00
parent 3d4afe7eab
commit 2003b9d115
10 changed files with 968 additions and 0 deletions

191
src/epl.py Normal file
View File

@@ -0,0 +1,191 @@
# --- Imports & configuration -------------------------------------------------
# Stdlib
import os
import re
from datetime import datetime, date, time, timedelta, timezone
from pathlib import Path
from zoneinfo import ZoneInfo  # Python 3.9+

# Third-party
import cloudscraper
import pandas as pd
import pytz
import requests
from bs4 import BeautifulSoup
from great_tables import GT, md
from lxml import etree

# Webhook URL that the rendered schedule image is POSTed to at the bottom of
# this script. BUG FIX: this was previously named `mlb_sched_url` (copy-paste
# from an MLB script), leaving the `epl_sched_url` reference at the POST call
# undefined (NameError). The old name is kept as an alias in case later,
# unseen parts of this file still reference it.
epl_sched_url = os.getenv("EPL_SCHED_URL")
mlb_sched_url = epl_sched_url  # backward-compatible alias for the old name

# Browser backend used by great_tables' GT.save() when rendering the PNG.
driver = 'chrome'
# --- Fetch the fixtures page -------------------------------------------------
# fbref sits behind Cloudflare, so a plain requests.get() is typically blocked;
# cloudscraper transparently solves the JS challenge.
# NOTE(review): these headers are defined but not currently passed to the
# request — confirm whether they are still needed.
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}
# Plain string: the original was an f-string with no placeholders.
url = 'https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures'

# Fetch the webpage content
scraper = cloudscraper.create_scraper()
response = scraper.get(url)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Extract tables. Wrapping the raw HTML in StringIO avoids the pandas
# FutureWarning about passing literal HTML straight to read_html().
from io import StringIO
tables = pd.read_html(StringIO(response.text))
soccer_schedule = tables[0]
# --- Clean up Date/Time ------------------------------------------------------
# Build a single kick-off timestamp from the separate Date and Time columns;
# rows that cannot be parsed become NaT instead of raising.
kickoff_text = (
    soccer_schedule["Date"].astype(str) + " " + soccer_schedule["Time"].astype(str)
)
soccer_schedule["DateTime_UK"] = pd.to_datetime(kickoff_text, errors="coerce")

# Fixtures are published in UK local time; attach that zone, then convert to
# New York for the US broadcast schedule. Ambiguous/nonexistent DST instants
# collapse to NaT rather than raising.
uk = pytz.timezone("Europe/London")
ny = pytz.timezone("America/New_York")
localized = soccer_schedule["DateTime_UK"].dt.tz_localize(
    uk, ambiguous="NaT", nonexistent="NaT"
)
soccer_schedule["DateTime_UK"] = localized
soccer_schedule["DateTime_NY"] = localized.dt.tz_convert(ny)

# 12-hour am/pm display string, with any leading zero stripped from the hour.
soccer_schedule["NY_Time"] = (
    soccer_schedule["DateTime_NY"].dt.strftime("%I:%M %p").str.lstrip("0")
)

# Normalise fbref's abbreviated club name.
soccer_schedule = soccer_schedule.replace("Nott'ham Forest", "Nottingham Forest")
import pandas as pd
import xml.etree.ElementTree as ET
from datetime import date

# Fix team naming: map fbref's short club names onto the longer names used in
# the EPG programme titles so the substring match below can succeed.
soccer_schedule = (
    soccer_schedule
    .replace("Nott'ham Forest", "Nottingham Forest")
    .replace("Newcastle Utd", "Newcastle United")
    .replace("Wolves", "Wolverhampton Wanderers")
)

# Filter down to fixtures kicking off today. The original row index is kept
# as `merge_key` so each schedule row can later be joined to its EPG listing.
today_mask = pd.to_datetime(soccer_schedule.Date) == pd.to_datetime(date.today())
df_today = (
    soccer_schedule[today_mask]
    .reset_index()
    .rename({'index': 'merge_key'}, axis='columns')
)
df_today = df_today[['Date', 'NY_Time', 'Away', 'Home', 'Wk', 'merge_key']]
# --- Build & post per-game channel tables ------------------------------------
# Only do any work if at least one fixture kicks off today.
if len(df_today) > 0:
    # Load the EPG (XMLTV) file and collect every Premier League programme
    # whose title mentions both of today's clubs.
    file_path = "data/epg.xml"  # replace with actual path
    tree = ET.parse(file_path)
    root = tree.getroot()

    data = []
    for _, row in df_today.iterrows():
        home = str(row["Home"]).strip()
        away = str(row["Away"]).strip()
        for prog in root.findall("programme"):
            title = prog.find("title").text if prog.find("title") is not None else ""
            # Keep only if Premier League and both team names appear
            if "Premier League" in title and home in title and away in title:
                data.append({
                    "merge_key": row["merge_key"],  # carry over merge_key for the join back
                    "Schedule_Home": home,
                    "Schedule_Away": away,
                    "Start": prog.attrib.get("start"),
                    "Stop": prog.attrib.get("stop"),
                    "Channel": prog.attrib.get("channel"),
                    "Title": title.strip(),
                })

    # Explicit columns keep the selection/merge below working even when no
    # EPG programme matched (previously an empty `data` list raised a
    # KeyError on "Start").
    df_matches = pd.DataFrame(
        data,
        columns=["merge_key", "Schedule_Home", "Schedule_Away",
                 "Start", "Stop", "Channel", "Title"],
    )

    # XMLTV start/stop attributes begin with 14 datetime digits (YYYYMMDDHHMMSS).
    df_matches["Start"] = pd.to_datetime(df_matches["Start"].str[:14], format="%Y%m%d%H%M%S", errors="coerce", utc=True)
    df_matches["Stop"] = pd.to_datetime(df_matches["Stop"].str[:14], format="%Y%m%d%H%M%S", errors="coerce", utc=True)

    # Attach the broadcasting channel(s) to each of today's fixtures.
    df = pd.merge(
        df_today,
        df_matches[['merge_key', 'Schedule_Home', 'Schedule_Away', 'Channel']].drop_duplicates(),
        how='left',
        on='merge_key',
    )

    # Normalise every NBC regional variant (NBC + 3-6 trailing capitals) to
    # plain "NBC". One pattern replaces the previous four ({4},{5},{6},{3})
    # which together matched exactly this range.
    nbc_pattern = re.compile(r"^NBC[A-Z]{3,6}$")

    for game in df['merge_key'].unique():
        temp = df[df['merge_key'] == game].drop_duplicates().reset_index(drop=True)
        temp = temp.rename({'NY_Time': 'Time'}, axis='columns')
        temp['Wk'] = temp.Wk.astype('int')

        # One row per distinct channel; split "channel.country" into parts.
        # reindex guarantees column 1 exists even when no value contains ".".
        temp2 = temp[['Channel']].drop_duplicates().reset_index(drop=True)
        split_cols = temp2['Channel'].str.split('.', n=1, expand=True).reindex(columns=[0, 1])
        temp2['Channel'] = split_cols[0]                          # part before "."
        temp2['Country'] = split_cols[1].str.upper().fillna("")   # part after ".", or "" if missing
        temp2 = temp2[['Country', 'Channel']]
        temp2['Country'] = temp2['Country'].replace('US2', 'US')
        # NaN channels (fixture with no EPG match after the left merge) are
        # passed through untouched instead of crashing the regex match.
        temp2["Channel"] = temp2["Channel"].apply(
            lambda x: "NBC" if isinstance(x, str) and nbc_pattern.match(x) else x
        )
        temp2 = temp2.drop_duplicates()

        # Render a small matchup table and POST the PNG to the webhook.
        game_table = (
            GT(temp2.drop_duplicates())
            .tab_header(
                title=f"{temp['Away'][0]} @ {temp['Home'][0]}",
                subtitle=f"Matchweek {temp.Wk[0]} {temp['Time'][0]} {str(temp['Date'][0]).split(' ')[0]}",
            ).tab_source_note(md(' '))
        )
        game_table_image_path = 'data/epl.png'
        game_table.save(game_table_image_path, window_size=(1000, 1000), web_driver=driver)
        with open(game_table_image_path, 'rb') as image_file:
            # Prepare the payload with the file
            payload = {
                'file': image_file
            }
            # Send the POST request to the webhook
            response = requests.post(epl_sched_url, files=payload)
        os.remove(game_table_image_path)