"""Post today's Premier League TV schedule to a webhook.

Pipeline:
  1. Scrape the fbref Premier League scores-and-fixtures table
     (cloudscraper bypasses Cloudflare).
  2. Convert kickoff times from UK time to New York time.
  3. Match today's fixtures against an XMLTV programme guide
     (``data/epg.xml``) to find the broadcasting channel(s).
  4. Render one summary table image per match with great_tables and
     POST it to the webhook given by the ``EPL_SCHED_URL`` env var.

Environment:
    EPL_SCHED_URL: webhook endpoint that receives each rendered PNG.
"""

import os
import re
import xml.etree.ElementTree as ET
from datetime import date
from io import StringIO

import cloudscraper
import pandas as pd
import pytz
import requests
from great_tables import GT, md

# Bug fix: the original read the env var into `mlb_sched_url` but posted
# to an undefined `epl_sched_url`, which raised NameError at runtime.
epl_sched_url = os.getenv("EPL_SCHED_URL")

# Web driver used by great_tables for PNG export.
driver = 'chrome'

# NOTE(review): these headers are defined but never passed to the request;
# kept unchanged for parity with the original script — confirm intent.
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}

url = 'https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures'

# Fetch the fixtures page (cloudscraper handles Cloudflare challenges).
scraper = cloudscraper.create_scraper()
response = scraper.get(url)

# First table on the page is the season schedule. StringIO avoids the
# pandas FutureWarning for literal-HTML input.
tables = pd.read_html(StringIO(response.text))
soccer_schedule = tables[0]

# --- Clean up Date/Time -------------------------------------------------
# Combine Date + Time into one naive timestamp; separator rows coerce to NaT.
soccer_schedule["DateTime_UK"] = pd.to_datetime(
    soccer_schedule["Date"].astype(str) + " " + soccer_schedule["Time"].astype(str),
    errors="coerce"
)

uk = pytz.timezone("Europe/London")
ny = pytz.timezone("America/New_York")

# Localize to UK time (DST-ambiguous/nonexistent kickoffs become NaT),
# then convert to New York time for display.
soccer_schedule["DateTime_UK"] = soccer_schedule["DateTime_UK"].dt.tz_localize(
    uk, ambiguous="NaT", nonexistent="NaT"
)
soccer_schedule["DateTime_NY"] = soccer_schedule["DateTime_UK"].dt.tz_convert(ny)

# am/pm display string, leading zero stripped from the hour.
soccer_schedule["NY_Time"] = soccer_schedule["DateTime_NY"].dt.strftime("%I:%M %p")
soccer_schedule["NY_Time"] = soccer_schedule["NY_Time"].str.lstrip("0")

# Normalize fbref's abbreviated club names so they match the EPG titles.
# (The original applied the Nott'ham Forest replace twice; once suffices.)
soccer_schedule = soccer_schedule.replace({
    "Nott'ham Forest": "Nottingham Forest",
    "Newcastle Utd": "Newcastle United",
    "Wolves": "Wolverhampton Wanderers",
})

# Today's fixtures; the original row index is preserved as `merge_key`
# so EPG matches can be joined back per fixture.
df_today = (
    soccer_schedule[
        pd.to_datetime(soccer_schedule.Date) == pd.to_datetime(date.today())
    ]
    .reset_index()
    .rename({'index': 'merge_key'}, axis='columns')
)
df_today = df_today[['Date', 'NY_Time', 'Away', 'Home', 'Wk', 'merge_key']]

if len(df_today) > 0:
    # --- Match fixtures to EPG programmes -------------------------------
    file_path = "data/epg.xml"
    tree = ET.parse(file_path)
    root = tree.getroot()

    data = []
    for _, row in df_today.iterrows():
        home = str(row["Home"]).strip()
        away = str(row["Away"]).strip()

        for prog in root.findall("programme"):
            title = prog.find("title").text if prog.find("title") is not None else ""

            # Keep only Premier League listings naming both clubs.
            if "Premier League" in title and home in title and away in title:
                data.append({
                    "merge_key": row["merge_key"],  # carry fixture identity
                    "Schedule_Home": home,
                    "Schedule_Away": away,
                    "Start": prog.attrib.get("start"),
                    "Stop": prog.attrib.get("stop"),
                    "Channel": prog.attrib.get("channel"),
                    "Title": title.strip()
                })

    df_matches = pd.DataFrame(data)

    if df_matches.empty:
        # Bug fix: an empty DataFrame has no columns, so the original
        # crashed with KeyError on df_matches["Start"]. Provide the
        # columns the merge below selects.
        df_matches = pd.DataFrame(
            columns=["merge_key", "Schedule_Home", "Schedule_Away",
                     "Start", "Stop", "Channel", "Title"]
        )
    else:
        # XMLTV timestamps look like "YYYYMMDDHHMMSS +0000"; parse the
        # first 14 chars as UTC.
        df_matches["Start"] = pd.to_datetime(
            df_matches["Start"].str[:14], format="%Y%m%d%H%M%S",
            errors="coerce", utc=True
        )
        df_matches["Stop"] = pd.to_datetime(
            df_matches["Stop"].str[:14], format="%Y%m%d%H%M%S",
            errors="coerce", utc=True
        )

    df = pd.merge(
        df_today,
        df_matches[['merge_key', 'Schedule_Home', 'Schedule_Away', 'Channel']].drop_duplicates(),
        how='left',
        on='merge_key'
    )

    # --- Render and post one table image per fixture --------------------
    for game in df['merge_key'].unique():
        temp = df[df['merge_key'] == game].drop_duplicates().reset_index(drop=True)
        temp = temp.rename({'NY_Time': 'Time'}, axis='columns')
        temp['Wk'] = temp.Wk.astype('int')

        # Channel strings look like "NAME.country"; split once on ".".
        temp2 = temp[['Channel']].drop_duplicates().reset_index(drop=True)
        split_cols = temp2['Channel'].str.split('.', n=1, expand=True)
        temp2['Channel'] = split_cols[0]
        # Robustness: when no channel contains ".", expand=True yields a
        # single column and split_cols[1] would KeyError in the original.
        if 1 in split_cols.columns:
            temp2['Country'] = split_cols[1].str.upper().fillna("")
        else:
            temp2['Country'] = ""
        temp2 = temp2[['Country', 'Channel']]

        temp2['Country'] = temp2['Country'].replace('US2', 'US')

        # Collapse regional NBC feeds ("NBC" + 3-6 capital letters) to
        # plain "NBC". The original used four copy-pasted patterns for
        # lengths 3, 4, 5 and 6; one quantified pattern covers them all.
        # The isinstance guard skips NaN channels from unmatched merges,
        # which made the original lambda raise TypeError.
        nbc_pattern = re.compile(r"^NBC[A-Z]{3,6}$")
        temp2["Channel"] = temp2["Channel"].apply(
            lambda ch: "NBC" if isinstance(ch, str) and nbc_pattern.match(ch) else ch
        )
        temp2 = temp2.drop_duplicates()

        game_table = (
            GT(temp2)
            .tab_header(
                title=f"{temp['Away'][0]} @ {temp['Home'][0]}",
                subtitle=f"Matchweek {temp.Wk[0]} {temp['Time'][0]} {str(temp['Date'][0]).split(' ')[0]}",
            )
            .tab_source_note(md(' '))
        )

        game_table_image_path = 'data/epl.png'
        game_table.save(game_table_image_path, window_size=(1000, 1000), web_driver=driver)

        with open(game_table_image_path, 'rb') as image_file:
            payload = {'file': image_file}
            # Bug fix: the original posted to the undefined name
            # `epl_sched_url`; a distinct name also avoids clobbering the
            # scrape `response` above.
            webhook_response = requests.post(epl_sched_url, files=payload)

        os.remove(game_table_image_path)