Working version

This commit is contained in:
2025-09-15 11:36:43 -04:00
parent 3d4afe7eab
commit 2003b9d115
10 changed files with 968 additions and 0 deletions

191
src/epl.py Normal file
View File

@@ -0,0 +1,191 @@
# --- Imports & configuration -------------------------------------------------
# Stdlib
import os
import re
from datetime import datetime, date, time, timedelta, timezone
from pathlib import Path
from zoneinfo import ZoneInfo  # Python 3.9+

# Third-party
import cloudscraper
import pandas as pd
import pytz
import requests
from bs4 import BeautifulSoup
from great_tables import GT, md
from lxml import etree

# Webhook URL that the rendered schedule image is POSTed to at the bottom of
# this script. BUG FIX: this was previously named `mlb_sched_url` (copy-paste
# from an MLB script), leaving the `epl_sched_url` reference at the POST call
# undefined (NameError). The old name is kept as an alias in case later,
# unseen parts of this file still reference it.
epl_sched_url = os.getenv("EPL_SCHED_URL")
mlb_sched_url = epl_sched_url  # backward-compatible alias for the old name

# Browser backend used by great_tables' GT.save() when rendering the PNG.
driver = 'chrome'
# --- Fetch the fixtures page -------------------------------------------------
# fbref sits behind Cloudflare, so a plain requests.get() is typically blocked;
# cloudscraper transparently solves the JS challenge.
# NOTE(review): these headers are defined but not currently passed to the
# request — confirm whether they are still needed.
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}
# Plain string: the original was an f-string with no placeholders.
url = 'https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures'

# Fetch the webpage content
scraper = cloudscraper.create_scraper()
response = scraper.get(url)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Extract tables. Wrapping the raw HTML in StringIO avoids the pandas
# FutureWarning about passing literal HTML straight to read_html().
from io import StringIO
tables = pd.read_html(StringIO(response.text))
soccer_schedule = tables[0]
# --- Clean up Date/Time ------------------------------------------------------
# Build a single kick-off timestamp from the separate Date and Time columns;
# rows that cannot be parsed become NaT instead of raising.
kickoff_text = (
    soccer_schedule["Date"].astype(str) + " " + soccer_schedule["Time"].astype(str)
)
soccer_schedule["DateTime_UK"] = pd.to_datetime(kickoff_text, errors="coerce")

# Fixtures are published in UK local time; attach that zone, then convert to
# New York for the US broadcast schedule. Ambiguous/nonexistent DST instants
# collapse to NaT rather than raising.
uk = pytz.timezone("Europe/London")
ny = pytz.timezone("America/New_York")
localized = soccer_schedule["DateTime_UK"].dt.tz_localize(
    uk, ambiguous="NaT", nonexistent="NaT"
)
soccer_schedule["DateTime_UK"] = localized
soccer_schedule["DateTime_NY"] = localized.dt.tz_convert(ny)

# 12-hour am/pm display string, with any leading zero stripped from the hour.
soccer_schedule["NY_Time"] = (
    soccer_schedule["DateTime_NY"].dt.strftime("%I:%M %p").str.lstrip("0")
)

# Normalise fbref's abbreviated club name.
soccer_schedule = soccer_schedule.replace("Nott'ham Forest", "Nottingham Forest")
import pandas as pd
import xml.etree.ElementTree as ET
from datetime import date

# Fix team naming: map fbref's short club names onto the longer names used in
# the EPG programme titles so the substring match below can succeed.
soccer_schedule = (
    soccer_schedule
    .replace("Nott'ham Forest", "Nottingham Forest")
    .replace("Newcastle Utd", "Newcastle United")
    .replace("Wolves", "Wolverhampton Wanderers")
)

# Filter down to fixtures kicking off today. The original row index is kept
# as `merge_key` so each schedule row can later be joined to its EPG listing.
today_mask = pd.to_datetime(soccer_schedule.Date) == pd.to_datetime(date.today())
df_today = (
    soccer_schedule[today_mask]
    .reset_index()
    .rename({'index': 'merge_key'}, axis='columns')
)
df_today = df_today[['Date', 'NY_Time', 'Away', 'Home', 'Wk', 'merge_key']]
# --- Build & post per-game channel tables ------------------------------------
# Only do any work if at least one fixture kicks off today.
if len(df_today) > 0:
    # Load the EPG (XMLTV) file and collect every Premier League programme
    # whose title mentions both of today's clubs.
    file_path = "data/epg.xml"  # replace with actual path
    tree = ET.parse(file_path)
    root = tree.getroot()

    data = []
    for _, row in df_today.iterrows():
        home = str(row["Home"]).strip()
        away = str(row["Away"]).strip()
        for prog in root.findall("programme"):
            title = prog.find("title").text if prog.find("title") is not None else ""
            # Keep only if Premier League and both team names appear
            if "Premier League" in title and home in title and away in title:
                data.append({
                    "merge_key": row["merge_key"],  # carry over merge_key for the join back
                    "Schedule_Home": home,
                    "Schedule_Away": away,
                    "Start": prog.attrib.get("start"),
                    "Stop": prog.attrib.get("stop"),
                    "Channel": prog.attrib.get("channel"),
                    "Title": title.strip(),
                })

    # Explicit columns keep the selection/merge below working even when no
    # EPG programme matched (previously an empty `data` list raised a
    # KeyError on "Start").
    df_matches = pd.DataFrame(
        data,
        columns=["merge_key", "Schedule_Home", "Schedule_Away",
                 "Start", "Stop", "Channel", "Title"],
    )

    # XMLTV start/stop attributes begin with 14 datetime digits (YYYYMMDDHHMMSS).
    df_matches["Start"] = pd.to_datetime(df_matches["Start"].str[:14], format="%Y%m%d%H%M%S", errors="coerce", utc=True)
    df_matches["Stop"] = pd.to_datetime(df_matches["Stop"].str[:14], format="%Y%m%d%H%M%S", errors="coerce", utc=True)

    # Attach the broadcasting channel(s) to each of today's fixtures.
    df = pd.merge(
        df_today,
        df_matches[['merge_key', 'Schedule_Home', 'Schedule_Away', 'Channel']].drop_duplicates(),
        how='left',
        on='merge_key',
    )

    # Normalise every NBC regional variant (NBC + 3-6 trailing capitals) to
    # plain "NBC". One pattern replaces the previous four ({4},{5},{6},{3})
    # which together matched exactly this range.
    nbc_pattern = re.compile(r"^NBC[A-Z]{3,6}$")

    for game in df['merge_key'].unique():
        temp = df[df['merge_key'] == game].drop_duplicates().reset_index(drop=True)
        temp = temp.rename({'NY_Time': 'Time'}, axis='columns')
        temp['Wk'] = temp.Wk.astype('int')

        # One row per distinct channel; split "channel.country" into parts.
        # reindex guarantees column 1 exists even when no value contains ".".
        temp2 = temp[['Channel']].drop_duplicates().reset_index(drop=True)
        split_cols = temp2['Channel'].str.split('.', n=1, expand=True).reindex(columns=[0, 1])
        temp2['Channel'] = split_cols[0]                          # part before "."
        temp2['Country'] = split_cols[1].str.upper().fillna("")   # part after ".", or "" if missing
        temp2 = temp2[['Country', 'Channel']]
        temp2['Country'] = temp2['Country'].replace('US2', 'US')
        # NaN channels (fixture with no EPG match after the left merge) are
        # passed through untouched instead of crashing the regex match.
        temp2["Channel"] = temp2["Channel"].apply(
            lambda x: "NBC" if isinstance(x, str) and nbc_pattern.match(x) else x
        )
        temp2 = temp2.drop_duplicates()

        # Render a small matchup table and POST the PNG to the webhook.
        game_table = (
            GT(temp2.drop_duplicates())
            .tab_header(
                title=f"{temp['Away'][0]} @ {temp['Home'][0]}",
                subtitle=f"Matchweek {temp.Wk[0]} {temp['Time'][0]} {str(temp['Date'][0]).split(' ')[0]}",
            ).tab_source_note(md(' '))
        )
        game_table_image_path = 'data/epl.png'
        game_table.save(game_table_image_path, window_size=(1000, 1000), web_driver=driver)
        with open(game_table_image_path, 'rb') as image_file:
            # Prepare the payload with the file
            payload = {
                'file': image_file
            }
            # Send the POST request to the webhook
            response = requests.post(epl_sched_url, files=payload)
        os.remove(game_table_image_path)