Working version
src/epl.py (new file, 191 lines)
@@ -0,0 +1,191 @@
from bs4 import BeautifulSoup
import pandas as pd
import re
import os
import requests
import cloudscraper
import pytz
from io import StringIO
from datetime import date
from great_tables import GT, md

# Browser used by great_tables when rendering the table image
driver = 'chrome'

# Webhook URL the finished schedule image is posted to
epl_sched_url = os.getenv("EPL_SCHED_URL")
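# Assumes EPL_SCHED_URL is set in the environment; os.getenv returns None
# when it is missing, and the webhook POST at the bottom would then fail.
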
# Request headers (defined for reference; scraper.get below does not use them)
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}

url = 'https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures'

# Fetch the page; cloudscraper handles the site's Cloudflare anti-bot check
scraper = cloudscraper.create_scraper()
response = scraper.get(url)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Extract tables; StringIO avoids the pandas deprecation for passing literal
# HTML, and the first table on the page is the fixtures list
tables = pd.read_html(StringIO(response.text))
soccer_schedule = tables[0]

# --- Clean up Date/Time ---
# Combine Date + Time into a single kickoff timestamp
soccer_schedule["DateTime_UK"] = pd.to_datetime(
    soccer_schedule["Date"].astype(str) + " " + soccer_schedule["Time"].astype(str),
    errors="coerce"
)

# Localize to UK time; DST-ambiguous or nonexistent times become NaT
uk = pytz.timezone("Europe/London")
ny = pytz.timezone("America/New_York")
soccer_schedule["DateTime_UK"] = soccer_schedule["DateTime_UK"].dt.tz_localize(uk, ambiguous="NaT", nonexistent="NaT")

# Convert to New York time
soccer_schedule["DateTime_NY"] = soccer_schedule["DateTime_UK"].dt.tz_convert(ny)

# Format for display in am/pm style
soccer_schedule["NY_Time"] = soccer_schedule["DateTime_NY"].dt.strftime("%I:%M %p")

# Drop the leading zero from the hour (optional)
soccer_schedule["NY_Time"] = soccer_schedule["NY_Time"].str.lstrip("0")

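# Kickoff times on the page are UK local times, so the naive timestamps are
# localized to Europe/London first and only then converted; localizing
# straight to America/New_York would shift every kickoff by the UK/US offset.
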
import xml.etree.ElementTree as ET

# Fix team naming so schedule names match the EPG listings
soccer_schedule = soccer_schedule.replace("Nott'ham Forest", "Nottingham Forest")
soccer_schedule = soccer_schedule.replace("Newcastle Utd", "Newcastle United")
soccer_schedule = soccer_schedule.replace("Wolves", "Wolverhampton Wanderers")

# Filter today's fixtures; the original row index is kept as a merge key
df_today = (
    soccer_schedule[
        pd.to_datetime(soccer_schedule.Date) == pd.to_datetime(date.today())
    ]
    .reset_index()
    .rename({'index': 'merge_key'}, axis='columns')
)

df_today = df_today[['Date', 'NY_Time', 'Away', 'Home', 'Wk', 'merge_key']]
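# Assumes the Date column parses cleanly (fbref currently renders it as
# YYYY-MM-DD); a row whose Date failed to parse would make the
# pd.to_datetime call above raise rather than be dropped.
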
if len(df_today) > 0:
    # Load the XMLTV EPG file
    file_path = "data/epg.xml"  # replace with actual path
    tree = ET.parse(file_path)
    root = tree.getroot()

    data = []

    # Iterate schedule rows
    for _, row in df_today.iterrows():
        home = str(row["Home"]).strip()
        away = str(row["Away"]).strip()

        for prog in root.findall("programme"):
            title = (prog.find("title").text or "") if prog.find("title") is not None else ""
            desc = (prog.find("desc").text or "") if prog.find("desc") is not None else ""

            # Keep only Premier League listings that mention both team names
            if "Premier League" in title and home in title and away in title:
                data.append({
                    "merge_key": row["merge_key"],  # carry the merge key over
                    "Schedule_Home": home,
                    "Schedule_Away": away,
                    "Start": prog.attrib.get("start"),
                    "Stop": prog.attrib.get("stop"),
                    "Channel": prog.attrib.get("channel"),
                    "Title": title.strip()
                })

    # Create DataFrame; naming the columns keeps the references below valid
    # even when no listing matched
    df_matches = pd.DataFrame(data, columns=[
        "merge_key", "Schedule_Home", "Schedule_Away",
        "Start", "Stop", "Channel", "Title"
    ])

    # Convert start/stop to datetime
    df_matches["Start"] = pd.to_datetime(df_matches["Start"].str[:14], format="%Y%m%d%H%M%S", errors="coerce", utc=True)
    df_matches["Stop"] = pd.to_datetime(df_matches["Stop"].str[:14], format="%Y%m%d%H%M%S", errors="coerce", utc=True)
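    # NOTE: XMLTV timestamps look like "20250817150000 +0000"; the [:14]
    # slice keeps only the digits, and utc=True assumes the feed's offset is
    # always +0000. A feed using other offsets would be parsed incorrectly.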

    # Attach the matched channel listings to today's fixtures
    df = pd.merge(
        df_today,
        df_matches[['merge_key', 'Schedule_Home', 'Schedule_Away', 'Channel']].drop_duplicates(),
        how='left',
        on='merge_key'
    )
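    # how='left' keeps fixtures with no EPG match; their Channel comes
    # through as NaN, which the fillna("") in the loop below maps to "".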

    ####

    # Build one channel table per fixture
    for game in df['merge_key'].unique():
        temp = df[df['merge_key'] == game].drop_duplicates().reset_index(drop=True)
        temp = temp.rename({'NY_Time': 'Time'}, axis='columns')
        temp['Wk'] = temp.Wk.astype('int')

        temp2 = temp[['Channel']].drop_duplicates().reset_index(drop=True)

        # Split once on "." into channel name and country suffix;
        # partition always yields three columns, so a missing "." is safe
        split_cols = temp2['Channel'].fillna("").str.partition('.')
        temp2['Channel'] = split_cols[0]              # part before "."
        temp2['Country'] = split_cols[2].str.upper()  # part after ".", or "" if missing

        # Reorder
        temp2 = temp2[['Country', 'Channel']]

        temp2['Country'] = temp2['Country'].replace('US2', 'US')

        # Collapse regional NBC feeds ('NBC' plus 3-6 uppercase letters)
        # down to plain 'NBC'
        pattern = re.compile(r"^NBC[A-Z]{3,6}$")
        temp2["Channel"] = temp2["Channel"].apply(lambda x: "NBC" if pattern.match(x) else x)

        temp2 = temp2.drop_duplicates()
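        # For illustration: a hypothetical channel id "NBCSPORTS.us2" would
        # come out as Country="US", Channel="NBC" after the steps above.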
        ###

        game_table = (
            GT(temp2.drop_duplicates())
            .tab_header(
                title=f"{temp['Away'][0]} @ {temp['Home'][0]}",
                subtitle=f"Matchweek {temp.Wk[0]} {temp['Time'][0]} {str(temp['Date'][0]).split(' ')[0]}",
            )
            .tab_source_note(md(' '))
        )

        game_table_image_path = 'data/epl.png'
        game_table.save(game_table_image_path, window_size=(1000, 1000), web_driver=driver)

        with open(game_table_image_path, 'rb') as image_file:
            # Prepare the payload with the file
            payload = {
                'file': image_file
            }

            # Send the POST request to the webhook
            response = requests.post(epl_sched_url, files=payload)
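            # A status check here would catch failed uploads (optional sketch):
            # response.raise_for_status()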

        # Clean up the rendered image once it has been posted
        os.remove(game_table_image_path)