# EPL schedule notifier: scrape today's Premier League fixtures from fbref,
# match them against EPG (TV listings) XML, render a per-game channel table
# with great_tables, and POST the image to a webhook.
from bs4 import BeautifulSoup
|
|
import pandas as pd
|
|
import re
|
|
from datetime import date
|
|
from lxml import etree
|
|
import pandas as pd
|
|
import re
|
|
from pathlib import Path
|
|
from datetime import datetime, date, time, timedelta, timezone
|
|
from zoneinfo import ZoneInfo # Python 3.9+
|
|
from great_tables import GT, md
|
|
import os
|
|
import requests

import cloudscraper

# Webhook that receives the rendered schedule image.
# BUG FIX: every downstream use posts to `epl_sched_url`, but the original
# assigned the env var to `mlb_sched_url`, leaving `epl_sched_url` undefined
# (NameError at post time). Assign the correct name; keep the old one as an
# alias in case code outside this view still references it.
epl_sched_url = os.getenv("EPL_SCHED_URL")
mlb_sched_url = epl_sched_url  # legacy alias; TODO confirm nothing else uses it

# Browser backend used by great_tables when saving the table as an image.
driver = 'chrome'
|
|
|
|
|
|
import pandas as pd
|
|
from bs4 import BeautifulSoup
|
|
import pytz
|
|
|
|
# Load the HTML file
|
|
# Request headers for the fbref fetch. The CORS-style Access-Control-* entries
# are request-side no-ops (they are response headers), but they are kept to
# preserve the exact bytes sent; the User-Agent is the part that matters.
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}

# Premier League scores & fixtures page.
# (Removed the stray `f` prefix: the string has no placeholders.)
url = 'https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures'
|
|
# Fetch the webpage content. cloudscraper transparently solves Cloudflare
# challenges that plain requests would trip on.
scraper = cloudscraper.create_scraper()

# FIX: the `headers` dict defined above was built but never sent; pass it so
# the custom User-Agent actually reaches fbref.
response = scraper.get(url, headers=headers)

# Parse the HTML content using BeautifulSoup.
# NOTE(review): `soup` is not used by the code visible here — presumably kept
# for future element-level extraction; confirm before removing.
soup = BeautifulSoup(response.content, 'html.parser')

# Extract all tables; the first table on the page is the fixtures schedule.
# Wrap the HTML in StringIO: passing a literal string to read_html is
# deprecated in recent pandas.
from io import StringIO
tables = pd.read_html(StringIO(response.text))
soccer_schedule = tables[0]
|
|
|
|
# --- Clean up Date/Time ---
# Stitch the separate Date and Time string columns into one timestamp column;
# unparseable rows (spacer rows, TBD kickoffs) become NaT.
combined_dt = soccer_schedule["Date"].astype(str) + " " + soccer_schedule["Time"].astype(str)
soccer_schedule["DateTime_UK"] = pd.to_datetime(combined_dt, errors="coerce")

# Kickoff times on fbref are UK wall-clock times: localize to Europe/London,
# then convert to the US Eastern zone. DST edge cases (ambiguous/nonexistent
# clock times) are mapped to NaT rather than raising.
uk = pytz.timezone("Europe/London")
ny = pytz.timezone("America/New_York")
soccer_schedule["DateTime_UK"] = soccer_schedule["DateTime_UK"].dt.tz_localize(
    uk, ambiguous="NaT", nonexistent="NaT"
)
soccer_schedule["DateTime_NY"] = soccer_schedule["DateTime_UK"].dt.tz_convert(ny)

# Display form: 12-hour am/pm, with the leading zero stripped from the hour
# ("09:30 AM" -> "9:30 AM"). %I never yields "00", so lstrip is safe here.
soccer_schedule["NY_Time"] = (
    soccer_schedule["DateTime_NY"].dt.strftime("%I:%M %p").str.lstrip("0")
)

# Normalize fbref's abbreviated club name.
soccer_schedule = soccer_schedule.replace("Nott'ham Forest", "Nottingham Forest")
|
|
|
|
|
|
import pandas as pd
|
|
import xml.etree.ElementTree as ET
|
|
from datetime import date
|
|
|
|
# Fix team naming: map fbref's abbreviated club names to their full names.
club_name_fixes = {
    "Nott'ham Forest": "Nottingham Forest",
    "Newcastle Utd": "Newcastle United",
    "Wolves": "Wolverhampton Wanderers",
}
soccer_schedule = soccer_schedule.replace(club_name_fixes)

# Keep only fixtures kicking off today. The original row index is preserved
# as `merge_key` so EPG channel matches can be joined back per fixture later.
is_today = pd.to_datetime(soccer_schedule.Date) == pd.to_datetime(date.today())
df_today = (
    soccer_schedule[is_today]
    .reset_index()
    .rename({'index': 'merge_key'}, axis='columns')
)

df_today = df_today[['Date', 'NY_Time', 'Away', 'Home', 'Wk', 'merge_key']]
|
|
|
|
if len(df_today) > 0:
    # --- Match today's fixtures against the EPG (TV listings) XML ---
    file_path = "data/epg.xml"  # replace with actual path
    tree = ET.parse(file_path)
    root = tree.getroot()

    data = []

    # For each fixture, scan every <programme> entry and keep listings whose
    # title mentions the Premier League and both club names.
    for _, row in df_today.iterrows():
        home = str(row["Home"]).strip()
        away = str(row["Away"]).strip()

        for prog in root.findall("programme"):
            title = prog.find("title").text if prog.find("title") is not None else ""

            if "Premier League" in title and home in title and away in title:
                data.append({
                    "merge_key": row["merge_key"],  # carry over merge_key for the join below
                    "Schedule_Home": home,
                    "Schedule_Away": away,
                    "Start": prog.attrib.get("start"),
                    "Stop": prog.attrib.get("stop"),
                    "Channel": prog.attrib.get("channel"),
                    "Title": title.strip(),
                })

    # FIX: pass explicit columns — pd.DataFrame([]) has no columns at all, so
    # the column selection in the merge below crashed whenever no programme
    # matched any fixture. With named columns the empty case degrades cleanly.
    df_matches = pd.DataFrame(
        data,
        columns=["merge_key", "Schedule_Home", "Schedule_Away",
                 "Start", "Stop", "Channel", "Title"],
    )

    # EPG timestamps look like "YYYYmmddHHMMSS +zzzz"; keep the 14-digit stamp.
    df_matches["Start"] = pd.to_datetime(df_matches["Start"].str[:14], format="%Y%m%d%H%M%S", errors="coerce", utc=True)
    df_matches["Stop"] = pd.to_datetime(df_matches["Stop"].str[:14], format="%Y%m%d%H%M%S", errors="coerce", utc=True)

    # Left-join channels back onto today's fixtures; fixtures with no EPG hit
    # keep NaN in the channel columns.
    df = pd.merge(
        df_today,
        df_matches[['merge_key', 'Schedule_Home', 'Schedule_Away', 'Channel']].drop_duplicates(),
        how='left',
        on='merge_key'
    )

    ####

    # Collapse regional NBC feeds (NBC + 3..6 capital letters, e.g. "NBCGOLF")
    # into the single label "NBC". One pattern replaces the four duplicated
    # NBC[A-Z]{3}/{4}/{5}/{6} regexes of the original. (The old comments said
    # 'FOX' but the code always substituted 'NBC' — comments corrected.)
    nbc_pattern = re.compile(r"^NBC[A-Z]{3,6}$")

    for game in df['merge_key'].unique():
        temp = df[df['merge_key'] == game].drop_duplicates().reset_index(drop=True)
        temp = temp.rename({'NY_Time': 'Time'}, axis='columns')
        temp['Wk'] = temp.Wk.astype('int')

        temp2 = temp[['Channel']].drop_duplicates().reset_index(drop=True)

        # Channel ids look like "NAME.country"; split once into name + country.
        # FIX: fillna("") first — fixtures without an EPG match carry NaN,
        # which previously blew up in .str operations and the regex below.
        split_cols = temp2['Channel'].fillna("").str.split('.', n=1, expand=True)
        temp2['Channel'] = split_cols[0]  # part before "."
        # FIX: guard the country column — when no channel contains ".",
        # expand=True yields only column 0 and split_cols[1] raised KeyError.
        if 1 in split_cols.columns:
            temp2['Country'] = split_cols[1].str.upper().fillna("")  # part after ".", or "" if missing
        else:
            temp2['Country'] = ""

        # Reorder for display.
        temp2 = temp2[['Country', 'Channel']]

        temp2['Country'] = temp2['Country'].replace('US2', 'US')
        # Replace regional NBC feed names with plain 'NBC'.
        temp2["Channel"] = temp2["Channel"].apply(lambda x: "NBC" if nbc_pattern.match(x) else x)

        temp2 = temp2.drop_duplicates()

        ###

        # Render a small "who / when / what channel" table for this game.
        game_table = (
            GT(temp2.drop_duplicates())
            .tab_header(
                title=f"{temp['Away'][0]} @ {temp['Home'][0]}",
                subtitle=f"Matchweek {temp.Wk[0]} {temp['Time'][0]} {str(temp['Date'][0]).split(' ')[0]}",
            )
            .tab_source_note(md(' '))
        )

        game_table_image_path = 'data/epl.png'
        game_table.save(game_table_image_path, window_size=(1000, 1000), web_driver=driver)

        with open(game_table_image_path, 'rb') as image_file:
            # Prepare the payload with the file
            payload = {
                'file': image_file
            }

            # Send the POST request to the webhook
            response = requests.post(epl_sched_url, files=payload)

        os.remove(game_table_image_path)
|
|
|
|
|