# EPL schedule notifier: scrape today's Premier League fixtures from fbref,
# match them against EPG (TV listings) XML, render a per-game channel table
# with great_tables, and POST the image to a webhook.
from bs4 import BeautifulSoup
|
|
import pandas as pd
|
|
import re
|
|
from datetime import date
|
|
from lxml import etree
|
|
import pandas as pd
|
|
import re
|
|
from pathlib import Path
|
|
from datetime import datetime, date, time, timedelta, timezone
|
|
from zoneinfo import ZoneInfo # Python 3.9+
|
|
from great_tables import GT, md
|
|
import os
|
|
import requests

import cloudscraper

# Webhook that receives the rendered schedule image.
# BUG FIX: every downstream use posts to `epl_sched_url`, but the original
# assigned the env var to `mlb_sched_url`, leaving `epl_sched_url` undefined
# (NameError at post time). Assign the correct name; keep the old one as an
# alias in case code outside this view still references it.
epl_sched_url = os.getenv("EPL_SCHED_URL")
mlb_sched_url = epl_sched_url  # legacy alias; TODO confirm nothing else uses it

# Browser backend used by great_tables when saving the table as an image.
driver = 'chrome'
|
|
|
|
|
|
import pandas as pd
|
|
from bs4 import BeautifulSoup
|
|
import pytz
|
|
|
|
# Load the HTML file
|
|
# Request headers for the fbref fetch. The CORS-style Access-Control-* entries
# are request-side no-ops (they are response headers), but they are kept to
# preserve the exact bytes sent; the User-Agent is the part that matters.
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}

# Premier League scores & fixtures page.
# (Removed the stray `f` prefix: the string has no placeholders.)
url = 'https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures'
|
|
# Fetch the webpage content. cloudscraper transparently solves Cloudflare
# challenges that plain requests would trip on.
scraper = cloudscraper.create_scraper()

# FIX: the `headers` dict defined above was built but never sent; pass it so
# the custom User-Agent actually reaches fbref.
response = scraper.get(url, headers=headers)

# Parse the HTML content using BeautifulSoup.
# NOTE(review): `soup` is not used by the code visible here — presumably kept
# for future element-level extraction; confirm before removing.
soup = BeautifulSoup(response.content, 'html.parser')

# Extract all tables; the first table on the page is the fixtures schedule.
# Wrap the HTML in StringIO: passing a literal string to read_html is
# deprecated in recent pandas.
from io import StringIO
tables = pd.read_html(StringIO(response.text))
soccer_schedule = tables[0]
|
|
|
|
# --- Clean up Date/Time ---
# Stitch the separate Date and Time string columns into one timestamp column;
# unparseable rows (spacer rows, TBD kickoffs) become NaT.
combined_dt = soccer_schedule["Date"].astype(str) + " " + soccer_schedule["Time"].astype(str)
soccer_schedule["DateTime_UK"] = pd.to_datetime(combined_dt, errors="coerce")

# Kickoff times on fbref are UK wall-clock times: localize to Europe/London,
# then convert to the US Eastern zone. DST edge cases (ambiguous/nonexistent
# clock times) are mapped to NaT rather than raising.
uk = pytz.timezone("Europe/London")
ny = pytz.timezone("America/New_York")
soccer_schedule["DateTime_UK"] = soccer_schedule["DateTime_UK"].dt.tz_localize(
    uk, ambiguous="NaT", nonexistent="NaT"
)
soccer_schedule["DateTime_NY"] = soccer_schedule["DateTime_UK"].dt.tz_convert(ny)

# Display form: 12-hour am/pm, with the leading zero stripped from the hour
# ("09:30 AM" -> "9:30 AM"). %I never yields "00", so lstrip is safe here.
soccer_schedule["NY_Time"] = (
    soccer_schedule["DateTime_NY"].dt.strftime("%I:%M %p").str.lstrip("0")
)

# Normalize fbref's abbreviated club name.
soccer_schedule = soccer_schedule.replace("Nott'ham Forest", "Nottingham Forest")
|
|
|
|
|
|
import pandas as pd
|
|
import xml.etree.ElementTree as ET
|
|
from datetime import date
|
|
|
|
# Fix team naming: map fbref's abbreviated club names to their full names.
club_name_fixes = {
    "Nott'ham Forest": "Nottingham Forest",
    "Newcastle Utd": "Newcastle United",
    "Wolves": "Wolverhampton Wanderers",
}
soccer_schedule = soccer_schedule.replace(club_name_fixes)

# Keep only fixtures kicking off today. The original row index is preserved
# as `merge_key` so EPG channel matches can be joined back per fixture later.
is_today = pd.to_datetime(soccer_schedule.Date) == pd.to_datetime(date.today())
df_today = (
    soccer_schedule[is_today]
    .reset_index()
    .rename({'index': 'merge_key'}, axis='columns')
)

df_today = df_today[['Date', 'NY_Time', 'Away', 'Home', 'Wk', 'merge_key']]
|
|
|
|
if len(df_today) > 0:
    # --- Match today's fixtures against the EPG (TV listings) XML ---
    file_path = "data/epg.xml"  # replace with actual path
    tree = ET.parse(file_path)
    root = tree.getroot()

    data = []

    # For each fixture, scan every <programme> entry and keep listings whose
    # title mentions the Premier League and both club names.
    for _, row in df_today.iterrows():
        home = str(row["Home"]).strip()
        away = str(row["Away"]).strip()

        for prog in root.findall("programme"):
            title = prog.find("title").text if prog.find("title") is not None else ""

            if "Premier League" in title and home in title and away in title:
                data.append({
                    "merge_key": row["merge_key"],  # carry over merge_key for the join below
                    "Schedule_Home": home,
                    "Schedule_Away": away,
                    "Start": prog.attrib.get("start"),
                    "Stop": prog.attrib.get("stop"),
                    "Channel": prog.attrib.get("channel"),
                    "Title": title.strip(),
                })

    # FIX: pass explicit columns — pd.DataFrame([]) has no columns at all, so
    # the column selection in the merge below crashed whenever no programme
    # matched any fixture. With named columns the empty case degrades cleanly.
    df_matches = pd.DataFrame(
        data,
        columns=["merge_key", "Schedule_Home", "Schedule_Away",
                 "Start", "Stop", "Channel", "Title"],
    )

    # EPG timestamps look like "YYYYmmddHHMMSS +zzzz"; keep the 14-digit stamp.
    df_matches["Start"] = pd.to_datetime(df_matches["Start"].str[:14], format="%Y%m%d%H%M%S", errors="coerce", utc=True)
    df_matches["Stop"] = pd.to_datetime(df_matches["Stop"].str[:14], format="%Y%m%d%H%M%S", errors="coerce", utc=True)

    # Left-join channels back onto today's fixtures; fixtures with no EPG hit
    # keep NaN in the channel columns.
    df = pd.merge(
        df_today,
        df_matches[['merge_key', 'Schedule_Home', 'Schedule_Away', 'Channel']].drop_duplicates(),
        how='left',
        on='merge_key'
    )

    ####

    # Collapse regional NBC feeds (NBC + 3..6 capital letters, e.g. "NBCGOLF")
    # into the single label "NBC". One pattern replaces the four duplicated
    # NBC[A-Z]{3}/{4}/{5}/{6} regexes of the original. (The old comments said
    # 'FOX' but the code always substituted 'NBC' — comments corrected.)
    nbc_pattern = re.compile(r"^NBC[A-Z]{3,6}$")

    for game in df['merge_key'].unique():
        temp = df[df['merge_key'] == game].drop_duplicates().reset_index(drop=True)
        temp = temp.rename({'NY_Time': 'Time'}, axis='columns')
        temp['Wk'] = temp.Wk.astype('int')

        temp2 = temp[['Channel']].drop_duplicates().reset_index(drop=True)

        # Channel ids look like "NAME.country"; split once into name + country.
        # FIX: fillna("") first — fixtures without an EPG match carry NaN,
        # which previously blew up in .str operations and the regex below.
        split_cols = temp2['Channel'].fillna("").str.split('.', n=1, expand=True)
        temp2['Channel'] = split_cols[0]  # part before "."
        # FIX: guard the country column — when no channel contains ".",
        # expand=True yields only column 0 and split_cols[1] raised KeyError.
        if 1 in split_cols.columns:
            temp2['Country'] = split_cols[1].str.upper().fillna("")  # part after ".", or "" if missing
        else:
            temp2['Country'] = ""

        # Reorder for display.
        temp2 = temp2[['Country', 'Channel']]

        temp2['Country'] = temp2['Country'].replace('US2', 'US')
        # Replace regional NBC feed names with plain 'NBC'.
        temp2["Channel"] = temp2["Channel"].apply(lambda x: "NBC" if nbc_pattern.match(x) else x)

        temp2 = temp2.drop_duplicates()

        ###

        # Render a small "who / when / what channel" table for this game.
        game_table = (
            GT(temp2.drop_duplicates())
            .tab_header(
                title=f"{temp['Away'][0]} @ {temp['Home'][0]}",
                subtitle=f"Matchweek {temp.Wk[0]} {temp['Time'][0]} {str(temp['Date'][0]).split(' ')[0]}",
            )
            .tab_source_note(md(' '))
        )

        game_table_image_path = 'data/epl.png'
        game_table.save(game_table_image_path, window_size=(1000, 1000), web_driver=driver)

        with open(game_table_image_path, 'rb') as image_file:
            # Prepare the payload with the file
            payload = {
                'file': image_file
            }

            # Send the POST request to the webhook
            response = requests.post(epl_sched_url, files=payload)

        os.remove(game_table_image_path)
|
|
|
|
|