Working version

2025-09-15 11:36:43 -04:00
parent 3d4afe7eab
commit 2003b9d115
10 changed files with 968 additions and 0 deletions

Dockerfile (new file, 60 lines)

@@ -0,0 +1,60 @@
# Dockerfile
FROM python:3.11-slim
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PIP_NO_CACHE_DIR=1 \
DEBIAN_FRONTEND=noninteractive \
TZ=America/New_York
# System deps for Chromium/chromedriver + rendering & lxml
RUN apt-get update && apt-get install -y --no-install-recommends \
chromium \
chromium-driver \
ca-certificates \
fonts-liberation \
fonts-dejavu \
libx11-6 \
libxcomposite1 \
libxdamage1 \
libxext6 \
libxfixes3 \
libxrandr2 \
libgbm1 \
libgtk-3-0 \
libnss3 \
libasound2 \
libatk-bridge2.0-0 \
libatk1.0-0 \
libcairo2 \
libpango-1.0-0 \
tzdata \
build-essential \
libxml2-dev \
libxslt1-dev \
&& rm -rf /var/lib/apt/lists/*
# Ensure Chromium is on a known path
ENV CHROME_BIN=/usr/bin/chromium \
CHROMEDRIVER=/usr/bin/chromedriver
WORKDIR /app
# Copy deps first for better caching
COPY requirements.txt .
RUN pip install -r requirements.txt
# Copy the app
COPY run.py ./run.py
COPY entrypoint.sh ./entrypoint.sh
COPY src ./src
COPY config ./config
# Make sure data/ exists at runtime (also volume-mounted by compose)
RUN mkdir -p /app/data && chmod +x /app/entrypoint.sh
# Non-root user (optional)
RUN useradd -ms /bin/bash appuser && chown -R appuser:appuser /app
USER appuser
ENTRYPOINT ["/app/entrypoint.sh"]

cronfile (new file, 2 lines)

@@ -0,0 +1,2 @@
# Run epg-runner every 5 minutes
*/5 * * * * docker compose run --rm epg-runner
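# Assumes cron runs this from the compose project directory; otherwise prepend
# `cd /path/to/project &&` or use `docker compose -f /path/to/docker-compose.yml run --rm epg-runner`
# (paths here are placeholders for the actual deployment layout).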

docker-compose.yml (new file, 16 lines)

@@ -0,0 +1,16 @@
version: "3.8"
services:
epg-runner:
build: .
image: epg-runner:latest
container_name: epg-runner
environment:
TZ: America/New_York
XML_URL: ""
MLB_SCHED_URL: ""
EPL_SCHED_URL: ""
UFC_SCHED_URL: ""
volumes:
- ./data:/app/data
    # Don't start automatically; only run when the scheduler calls it
restart: "no"

entrypoint.sh (new file, 13 lines)

@@ -0,0 +1,13 @@
#!/usr/bin/env bash
set -euo pipefail
# Create data dir if missing (also persisted via volume)
mkdir -p /app/data
# Helpful debug: show versions
python --version
echo "Chromium: $(chromium --version || true)"
echo "Chromedriver: $(chromedriver --version || true)"
# Run your task
exec python /app/run.py

requirements.txt (new file, 9 lines)

@@ -0,0 +1,9 @@
beautifulsoup4>=4.12
pandas>=2.2
lxml>=5.2
requests>=2.32
cloudscraper>=1.2
pytz>=2024.1
Pillow>=10.4
great-tables>=0.10.0
selenium>=4.25

run.py (new file, 50 lines)

@@ -0,0 +1,50 @@
# run.py
## Download EPG ##
import os
import sys
import time
from datetime import datetime, timedelta
from pathlib import Path

import requests

xml_url = os.getenv("XML_URL")
DATA_DIR = Path("data")
DATA_DIR.mkdir(parents=True, exist_ok=True)
local_path = DATA_DIR / "epg.xml"
def download_epg():
try:
resp = requests.get(xml_url, timeout=30)
resp.raise_for_status()
local_path.write_bytes(resp.content)
print(f"EPG saved to {local_path}")
except Exception as e:
print(f"Failed to download EPG: {e}", file=sys.stderr)
# If EPG is required for later steps, you may want to sys.exit(1)
download_epg()
def run(mod):
    print(mod.upper())
    # os.system returns the raw wait status on Unix, so any nonzero value means failure
    rc = os.system(f"python -m src.{mod}")
    if rc != 0:
        print(f"ERROR in {mod} (exit status {rc})", file=sys.stderr)
for mod in ["mlb", "epl", "ufc"]:
run(mod)
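# A possible alternative sketch (not wired in): subprocess.run reports the module's
# actual exit code, unlike the raw wait status returned by os.system:
#   import subprocess
#   rc = subprocess.run([sys.executable, "-m", f"src.{mod}"]).returncode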
##################
##### SLEEPY #####
##################
print(datetime.now())
# Sleep until 7:30 the next morning
now = datetime.now()
manana = (now + timedelta(days=1)).replace(hour=7, minute=30, second=0, microsecond=0)
print('sleeping until ' + str(manana))
sleep_seconds = (manana - now).total_seconds()
print(sleep_seconds)
time.sleep(sleep_seconds)
#del game_url
#del main_soup
print(datetime.now())

src/__init__.py (new file, empty)

src/epl.py (new file, 191 lines)

@@ -0,0 +1,191 @@
import os
import re
import xml.etree.ElementTree as ET
from datetime import date

import cloudscraper
import pandas as pd
import pytz
import requests
from bs4 import BeautifulSoup
from great_tables import GT, md

epl_sched_url = os.getenv("EPL_SCHED_URL")
driver = 'chrome'
# Request headers for the fixtures page
headers = {
'Access-Control-Allow-Origin': '*',
'Access-Control-Allow-Methods': 'GET',
'Access-Control-Allow-Headers': 'Content-Type',
'Access-Control-Max-Age': '3600',
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}
url = 'https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures'
# Fetch the webpage content
scraper = cloudscraper.create_scraper()
response = scraper.get(url)
# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')
# Extract tables
tables = pd.read_html(response.text)
soccer_schedule = tables[0]
# --- Clean up Date/Time ---
# Combine Date + Time
soccer_schedule["DateTime_UK"] = pd.to_datetime(
soccer_schedule["Date"].astype(str) + " " + soccer_schedule["Time"].astype(str),
errors="coerce"
)
# Localize to UK time
uk = pytz.timezone("Europe/London")
ny = pytz.timezone("America/New_York")
soccer_schedule["DateTime_UK"] = soccer_schedule["DateTime_UK"].dt.tz_localize(uk, ambiguous="NaT", nonexistent="NaT")
# Convert to New York time
soccer_schedule["DateTime_NY"] = soccer_schedule["DateTime_UK"].dt.tz_convert(ny)
# Format for display in am/pm style
soccer_schedule["NY_Time"] = soccer_schedule["DateTime_NY"].dt.strftime("%I:%M %p")
# Drop leading zeros from hour (optional)
soccer_schedule["NY_Time"] = soccer_schedule["NY_Time"].str.lstrip("0")
# Fix team naming so it matches the EPG programme titles
soccer_schedule = soccer_schedule.replace("Nott'ham Forest", "Nottingham Forest")
soccer_schedule = soccer_schedule.replace("Newcastle Utd", "Newcastle United")
soccer_schedule = soccer_schedule.replace("Wolves", "Wolverhampton Wanderers")
# Filter today's schedule & keep merge_key
df_today = (
soccer_schedule[
pd.to_datetime(soccer_schedule.Date) == pd.to_datetime(date.today())
]
.reset_index()
.rename({'index': 'merge_key'}, axis='columns')
)
df_today = df_today[['Date', 'NY_Time', 'Away', 'Home', 'Wk', 'merge_key']]
if len(df_today)>0:
# Load XML
file_path = "data/epg.xml" # replace with actual path
tree = ET.parse(file_path)
root = tree.getroot()
data = []
# Iterate schedule rows
for _, row in df_today.iterrows():
home = str(row["Home"]).strip()
away = str(row["Away"]).strip()
for prog in root.findall("programme"):
title = prog.find("title").text if prog.find("title") is not None else ""
desc = prog.find("desc").text if prog.find("desc") is not None else ""
# Keep only if Premier League and both team names appear
if "Premier League" in title and home in title and away in title:
data.append({
"merge_key": row["merge_key"], # ✅ carry over merge_key
"Schedule_Home": home,
"Schedule_Away": away,
"Start": prog.attrib.get("start"),
"Stop": prog.attrib.get("stop"),
"Channel": prog.attrib.get("channel"),
"Title": title.strip() })
# Create DataFrame
df_matches = pd.DataFrame(data)
# Convert start/stop to datetime
df_matches["Start"] = pd.to_datetime(df_matches["Start"].str[:14], format="%Y%m%d%H%M%S", errors="coerce", utc=True)
df_matches["Stop"] = pd.to_datetime(df_matches["Stop"].str[:14], format="%Y%m%d%H%M%S", errors="coerce", utc=True)
df = pd.merge(
df_today,
df_matches[['merge_key','Schedule_Home','Schedule_Away','Channel']].drop_duplicates(),
how='left',
on='merge_key'
)
####
for game in df['merge_key'].unique():
temp = df[df['merge_key']==game].drop_duplicates().reset_index(drop=True)
temp = temp.rename({'NY_Time':'Time'},axis='columns')
temp['Wk'] = temp.Wk.astype('int')
temp2 = temp[['Channel']].drop_duplicates().reset_index(drop=True)
# Split once on "." into two columns
split_cols = temp2['Channel'].str.split('.', n=1, expand=True)
temp2['Channel'] = split_cols[0] # part before "."
temp2['Country'] = split_cols[1].str.upper().fillna("") # part after ".", or "" if missing
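        # e.g. a raw id like "NBCSN.us" (hypothetical) splits into Channel "NBCSN", Country "US"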
# Reorder
temp2 = temp2[['Country', 'Channel']]
        # Collapse regional NBC feeds (e.g. channel ids like NBCSPORTS) into a single "NBC" row
        temp2['Country'] = temp2['Country'].replace('US2', 'US')
        pattern = re.compile(r"^NBC[A-Z]{3,6}$")
        temp2["Channel"] = temp2["Channel"].apply(lambda x: "NBC" if pattern.match(x) else x)
temp2 = temp2.drop_duplicates()
###
game_table = (
GT(temp2.drop_duplicates())
.tab_header(
title=f"{temp['Away'][0]} @ {temp['Home'][0]}",
subtitle=f"Matchweek {temp.Wk[0]} {temp['Time'][0]} {str(temp['Date'][0]).split(' ')[0]}",
).tab_source_note(md(' '))
)
game_table_image_path = 'data/epl.png'
game_table.save(game_table_image_path,window_size=(1000, 1000),web_driver=driver)
with open(game_table_image_path, 'rb') as image_file:
# Prepare the payload with the file
payload = {
'file': image_file
}
# Send the POST request to the webhook
response = requests.post(epl_sched_url, files=payload)
os.remove(game_table_image_path)

src/mlb.py (new file, 333 lines)

@@ -0,0 +1,333 @@
import os
import re
from datetime import datetime, date, time, timedelta, timezone
from pathlib import Path
from zoneinfo import ZoneInfo  # Python 3.9+

import cloudscraper
import pandas as pd
import requests
from bs4 import BeautifulSoup
from great_tables import GT, md
from lxml import etree

mlb_sched_url = os.getenv("MLB_SCHED_URL")
driver = 'chrome'
year = date.today().year
# Request headers for the schedule page
headers = {
'Access-Control-Allow-Origin': '*',
'Access-Control-Allow-Methods': 'GET',
'Access-Control-Allow-Headers': 'Content-Type',
'Access-Control-Max-Age': '3600',
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}
url = f'https://www.baseball-reference.com/leagues/majors/{year}-schedule.shtml'
# Fetch the webpage content
scraper = cloudscraper.create_scraper()
response = scraper.get(url)
# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')
today_section = soup.find("span", id="today")
data = []
if today_section:
for game in today_section.find_all_next("p", class_="game"):
gm_time = game.find("strong").get_text(strip=True) if game.find("strong") else None
teams = game.find_all("a")
if len(teams) >= 2:
away_team = teams[0].get_text(strip=True)
home_team = teams[1].get_text(strip=True)
else:
away_team = home_team = None
preview_tag = game.find("em")
preview_link = preview_tag.a["href"] if preview_tag and preview_tag.a else None
# Extract date from preview link if present
game_date = None
if preview_link:
game_date = pd.to_datetime(preview_link.split('/')[3][3:11])
data.append({
"Time": gm_time,
"Away Team": away_team,
"Home Team": home_team,
"Preview Link": preview_link,
"Game Date": game_date
})
sched = pd.DataFrame(data)
sched = sched[sched['Game Date']==str(date.today())]
if len(sched)>0:
sched['Away Team'] = sched['Away Team'].replace("Arizona D'Backs","Arizona Diamondbacks")
sched['Home Team'] = sched['Home Team'].replace("Arizona D'Backs","Arizona Diamondbacks")
##############
teams = list(pd.concat([sched['Away Team'], sched['Home Team'] ]))
##############
# --- point this to your file ---
xml_path = Path("data/epg.xml")
# Parse with lxml recovery; wrap to guarantee a single root
parser = etree.XMLParser(recover=True, encoding="utf-8")
tree = etree.fromstring(b"<root>" + xml_path.read_bytes() + b"</root>", parser)
# Filters
#teams = ["Philadelphia Phillies", "Boston Red Sox"]
# Regex for filtering (any team)
pattern = re.compile("|".join(map(re.escape, teams)), re.IGNORECASE)
# Helper: find which team(s) appear in title, return a list (no duplicates, canonical casing)
def find_teams_list(title: str, teams_list):
found = []
for t in teams_list:
if re.search(re.escape(t), title, re.IGNORECASE):
found.append(t)
# Ensure max 2 and stable order as in teams_list
return found[:2]
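    # e.g. with teams = ["Boston Red Sox", "New York Yankees"] (hypothetical), a programme title
    # "MLB Baseball: Boston Red Sox at New York Yankees" yields ["Boston Red Sox", "New York Yankees"];
    # the order follows teams_list, not the title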
# Collect rows
rows = []
for prog in tree.findall(".//programme"):
title_el = prog.find("title")
title = (title_el.text or "").strip() if title_el is not None else ""
if not pattern.search(title):
continue
teams_found_list = find_teams_list(title, teams)
if not teams_found_list: # safety
continue
start_raw = prog.get("start") or "" # e.g. "20250910221000 +0000"
channel = prog.get("channel")
# Parse start as UTC (first 14 chars = YYYYMMDDHHMMSS)
try:
dt_utc = datetime.strptime(start_raw[:14], "%Y%m%d%H%M%S").replace(tzinfo=timezone.utc)
except Exception:
continue
rows.append({
"title": title,
"start": start_raw,
"channel": channel,
"start_dt_utc": dt_utc,
# temporarily store full list; we'll split later
"teams_found_list": teams_found_list
})
tv = pd.DataFrame(rows)
if tv.empty:
print(tv)
else:
# Convert to local time (America/New_York)
local_tz = ZoneInfo("America/New_York")
tv["start_dt_local"] = tv["start_dt_utc"].dt.tz_convert(local_tz)
# Compute today's local window
today_local = datetime.now(local_tz).date()
start_of_day = datetime.combine(today_local, time.min, tzinfo=local_tz)
end_of_day = start_of_day + timedelta(days=1)
# Filter to events whose local start falls on "today"
mask = (tv["start_dt_local"] >= start_of_day) & (tv["start_dt_local"] < end_of_day)
df_today = tv.loc[mask, ["title", "start", "channel", "teams_found_list"]].reset_index(drop=True)
# ---- split start into local_date/local_time columns ----
def split_start_to_local_date_time(start_str: str, tz_str: str = "America/New_York"):
if not start_str:
return None, None
dt_utc = datetime.strptime(start_str[:14], "%Y%m%d%H%M%S").replace(tzinfo=timezone.utc)
local_dt = dt_utc.astimezone(ZoneInfo(tz_str))
return local_dt.date(), local_dt.time()
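        # e.g. "20250910221000 +0000" -> (date(2025, 9, 10), time(18, 10)) in America/New_York (EDT)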
df_today[["local_date", "local_time"]] = df_today["start"].apply(
lambda s: pd.Series(split_start_to_local_date_time(s))
)
# ---- create team_found_1 and team_found_2 columns ----
def to_two_cols(lst):
# pad to length 2 with None
padded = (lst + [None, None])[:2]
return pd.Series(padded, index=["team_found_1", "team_found_2"])
df_today[["team_found_1", "team_found_2"]] = df_today["teams_found_list"].apply(to_two_cols)
df_today = df_today.drop(columns=["teams_found_list"])
# reorder columns if you like
df_today = df_today[["title", "channel", "start", "local_date", "local_time", "team_found_1", "team_found_2"]]
# Optional: df_today.to_csv("filtered_programmes_today.csv", index=False)
#####
########
df = pd.merge(
sched,
df_today[[
'channel',
'local_time',
'team_found_1'
]].drop_duplicates().rename({
'team_found_1':'Away Team'
},axis='columns'),
how='left',
on='Away Team'
)
#########
        df = df.copy()  # work on a copy before adding derived columns
# 1) Build the "game start" local datetime from Game Date + Time (e.g., "2025-09-10" + "2:35 pm")
df["Game Date"] = pd.to_datetime(df["Game Date"], errors="coerce")
df["Time"] = df["Time"].astype(str).str.strip()
game_dt = pd.to_datetime(
df["Game Date"].dt.strftime("%Y-%m-%d") + " " + df["Time"],
format="%Y-%m-%d %I:%M %p",
errors="coerce"
)
# 2) Parse away/home local times (strings like "14:30:00") and combine with Game Date
def combine_date_and_hms(date_series, hms_series):
# parse HH:MM:SS (coerce invalid to NaT)
parsed = pd.to_datetime(hms_series.astype(str).str.strip(), format="%H:%M:%S", errors="coerce")
# keep only time-of-day; combine with date
return pd.to_datetime(date_series.dt.strftime("%Y-%m-%d") + " " + parsed.dt.strftime("%H:%M:%S"), errors="coerce")
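        # e.g. Game Date 2025-09-10 combined with "18:10:00" -> Timestamp("2025-09-10 18:10:00")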
away_dt = combine_date_and_hms(df["Game Date"], df["local_time"])
        # 3) Keep rows where the away-side EPG start time falls within a window around
        #    the scheduled first pitch (1 hour before to 4 hours after)
        mask = (away_dt >= game_dt - timedelta(hours=1)) & (away_dt <= game_dt + timedelta(hours=4))
filtered = df.loc[mask].copy()
df = pd.merge(
sched,
df_today[[
'channel',
'local_time',
'team_found_2'
]].drop_duplicates().rename({
'team_found_2':'Home Team'
},axis='columns'),
how='left',
on='Home Team',
suffixes=['_Away','_Home']
)
######
        df = df.copy()  # work on a copy before adding derived columns
# 1) Build the "game start" local datetime from Game Date + Time (e.g., "2025-09-10" + "2:35 pm")
df["Game Date"] = pd.to_datetime(df["Game Date"], errors="coerce")
df["Time"] = df["Time"].astype(str).str.strip()
game_dt = pd.to_datetime(
df["Game Date"].dt.strftime("%Y-%m-%d") + " " + df["Time"],
format="%Y-%m-%d %I:%M %p",
errors="coerce"
)
        # 2) Reuse combine_date_and_hms (defined above) to combine Game Date with the
        #    home-side local times
        home_dt = combine_date_and_hms(df["Game Date"], df["local_time"])
        # 3) Keep rows where the home-side EPG start time falls within the same window
        #    around the scheduled first pitch (1 hour before to 4 hours after)
        mask = (home_dt >= game_dt - timedelta(hours=1)) & (home_dt <= game_dt + timedelta(hours=4))
filtered2 = df.loc[mask].copy()
######
df = pd.merge(
filtered,
filtered2,
how='left',
on=['Time','Away Team','Home Team','Preview Link','Game Date'],
suffixes=['_Home','_Away']
)
#####
for game in df['Preview Link'].unique():
temp = df[df['Preview Link']==game].drop_duplicates().reset_index(drop=True)
temp2 = pd.DataFrame({
'Channel':list(temp['channel_Home'].dropna())+list(temp['channel_Away'].dropna())
})
temp2 = temp2.drop_duplicates().reset_index(drop=True)
temp2['Country'] = [x.split('.')[1].upper() for x in temp2.Channel]
temp2['Channel'] = [x.split('.')[0] for x in temp2.Channel]
temp2 = temp2[['Country','Channel']]
# Regex pattern: 'FOX' + 4 uppercase letters (total length 7, all caps)
pattern = re.compile(r"^FOX[A-Z]{4}$")
# Replace matching channels with 'FOX'
temp2['Country'] = temp2['Country'].replace('US2','US')
temp2["Channel"] = temp2["Channel"].apply(lambda x: "FOX" if pattern.match(x) else x)
temp2 = temp2.drop_duplicates()
game_table = (
GT(temp2.drop_duplicates())
.tab_header(
title=f"{temp['Away Team'][0]} @ {temp['Home Team'][0]}",
subtitle=f"{temp['Time'][0]} {str(temp['Game Date'][0]).split(' ')[0]}",
).tab_source_note(md(' '))
)
game_table_image_path = 'data/mlb.png'
game_table.save(game_table_image_path,window_size=(1000, 1000),web_driver=driver)
with open(game_table_image_path, 'rb') as image_file:
# Prepare the payload with the file
payload = {
'file': image_file
}
# Send the POST request to the webhook
response = requests.post(mlb_sched_url, files=payload)
os.remove(game_table_image_path)

src/ufc.py (new file, 294 lines)

@@ -0,0 +1,294 @@
import os
import re
import unicodedata
import xml.etree.ElementTree as ET
from datetime import date

import cloudscraper
import pandas as pd
import requests
from bs4 import BeautifulSoup, Tag
from great_tables import GT, md
from PIL import Image

ufc_sched_url = os.getenv("UFC_SCHED_URL")
driver = 'chrome'
# Request headers for the schedule page
headers = {
'Access-Control-Allow-Origin': '*',
'Access-Control-Allow-Methods': 'GET',
'Access-Control-Allow-Headers': 'Content-Type',
'Access-Control-Max-Age': '3600',
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}
url = 'https://www.mmafighting.com/schedule/ufc'
# Fetch the webpage content
scraper = cloudscraper.create_scraper()
response = scraper.get(url)
# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')
events, dates, card_types, fights = [], [], [], []
main_starts, prelim_starts = [], []
def get_card_type(li: Tag) -> str:
"""Find the nearest split section header (Main Card / Undercard)."""
split = li.find_parent("div", class_="m-mmaf-pte-event-list__split-item")
if split:
h4 = split.find("h4")
if h4:
return h4.get_text(strip=True)
return "Main Card" # default for top-level <ul> blocks
_time_pat = r"([0-9]{1,2}(?::[0-9]{2})?\s*[ap]\.m\.)"
def normalize_time(t: str | None) -> str | None:
if not t:
return None
s = t.lower()
s = re.sub(r"\s*a\.m\.", "am", s)
s = re.sub(r"\s*p\.m\.", "pm", s)
s = s.replace(" ", "")
return s # e.g., "6pm", "7:30pm"
def extract_event_times(section_nodes: list[Tag]) -> tuple[str | None, str | None]:
"""Look for the tv-info <p> within this event's own section only."""
tv_text = None
for n in section_nodes:
if isinstance(n, Tag) and n.name == "p" and "m-mmaf-pte-event-list__tv-info" in (n.get("class") or []):
tv_text = n.get_text(" ", strip=True).lower()
break
if not tv_text:
return None, None
main_m = re.search(r"main\s*card[^;]*?\bat\s*" + _time_pat, tv_text)
prelim_m = re.search(r"prelims?[^;]*?\bat\s*" + _time_pat, tv_text)
main_time = normalize_time(main_m.group(1)) if main_m else None
prelim_time = normalize_time(prelim_m.group(1)) if prelim_m else None
return main_time, prelim_time
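# e.g. a tv-info line like "the main card at 8 p.m. with prelims at 6 p.m." (hypothetical) -> ("8pm", "6pm")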
# The schedule list wrapper(s)
wrappers = soup.find_all("div", class_="m-mmaf-pte-event-list")
for wrapper in wrappers:
# iterate each event header inside the wrapper
for h2 in wrapper.find_all("h2"):
event_name = h2.get_text(strip=True)
# the date for this event is the next sibling <h3>
date_tag = h2.find_next_sibling("h3")
event_date = date_tag.get_text(strip=True) if date_tag else ""
# gather siblings after date_tag (or h2 if no date) until the next <h2>
start_from = date_tag if date_tag else h2
section_nodes = []
for sib in start_from.next_siblings:
if isinstance(sib, Tag) and sib.name == "h2":
break
section_nodes.append(sib)
# extract TV times **for this event only**
main_time, prelim_time = extract_event_times(section_nodes)
            # within this event's section, collect every <li> fight
for node in section_nodes:
if not isinstance(node, Tag):
continue
for li in node.find_all("li"):
# prefer the anchor text; fallback to full li text
a = li.find("a")
fight_text = a.get_text(" ", strip=True) if a else li.get_text(" ", strip=True)
# strip any trailing labels like "Title Fight"
fight_text = fight_text.replace("Title Fight", "").strip()
events.append(event_name)
dates.append(event_date)
card_types.append(get_card_type(li))
fights.append(fight_text)
main_starts.append(main_time)
prelim_starts.append(prelim_time)
# build dataframe
sched = pd.DataFrame({
"Event": events,
"Date": dates,
"Card Type": card_types,
"Fight": fights,
"main_start": main_starts,
"prelim_start": prelim_starts
})
# Keep only events scheduled for today
sched = sched[
(pd.to_datetime(sched.Date) == pd.to_datetime(date.today()))
].reset_index(drop=True)
if len(sched)>0:
# group fights by consecutive card type (keep first two groups)
event=[0]
j=0
for i in range(1,len(sched)):
if sched['Card Type'][i] == sched['Card Type'][i-1]:
event.append(j)
else:
j += 1
event.append(j)
sched['i'] = event
sched = sched[sched['i'] < 2].drop(['i'], axis='columns')
###########
# --- normalization helpers ---
def normalize(text: str) -> str:
if not text:
return ""
text = text.lower()
text = unicodedata.normalize("NFKD", text)
text = "".join(c for c in text if not unicodedata.combining(c))
text = text.replace("vs.", "x")
text = re.sub(r"[^a-z0-9\s]", " ", text) # keep alphanum only
text = re.sub(r"\s+", " ", text).strip()
return text
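    # e.g. normalize("Lopes vs. Silva") -> "lopes x silva"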
def tokenize_event(event_title: str):
"""Return prefix tokens and headline tokens separately."""
parts = [p.strip() for p in event_title.split(":")]
if len(parts) == 2:
prefix = normalize(parts[0])
headline = normalize(parts[1])
else:
prefix, headline = "", normalize(event_title)
prefix_tokens = set(prefix.split()) - {"ufc", "fight", "night", "new"}
headline_tokens = set(headline.split()) - {"ufc", "fight", "night", "new"}
return prefix_tokens, headline_tokens
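    # e.g. tokenize_event("Noche UFC: Lopes vs. Silva") -> ({"noche"}, {"lopes", "x", "silva"})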
    # Tokenize the headline of today's scheduled event
    event_title = sched['Event'][0]
    prefix_tokens, headline_tokens = tokenize_event(event_title)
# --- parse XML ---
xml_file = "data/epg.xml"
tree = ET.parse(xml_file)
root = tree.getroot()
matches = []
for prog in root.findall("programme"):
title_el = prog.find("title")
desc_el = prog.find("desc")
title_text = title_el.text if (title_el is not None and title_el.text) else ""
desc_text = desc_el.text if (desc_el is not None and desc_el.text) else ""
full_text = normalize(title_text + " " + desc_text)
# check: require all headline tokens present in full_text
if all(tok in full_text for tok in headline_tokens):
matches.append({
"event": event_title,
"programme_title": title_text.strip(),
"desc": desc_text.strip(),
"start": prog.attrib.get("start"),
"stop": prog.attrib.get("stop"),
"channel": prog.attrib.get("channel")
})
################
game_table_image_path = "data/ufc.png"
game_table_image_path0 = "data/ufc_sched.png"
game_table_image_path1 = "data/ufc_tv.png"
# Save UFC schedule table
    temp = sched[['Card Type', 'Fight']].copy()
    temp['Card Type'] = temp['Card Type'].mask(temp['Card Type'].duplicated(), '')
game_table = (
GT(temp)
.tab_header(
title=f"{sched['Event'][0]}",
subtitle=md(
f"{sched['Date'][0]} \nPrelims: {str(sched['prelim_start'][0]).split(' ')[0]} \nMain Event: {str(sched['main_start'][0]).split(' ')[0]}"
),
)
.tab_source_note(md(' '))
)
game_table.save(game_table_image_path0, window_size=(1000,1000), web_driver=driver)
# Render TV table separately
tv = pd.DataFrame(matches)
tv = tv[['channel']].rename({'channel':'Channel'},axis='columns').drop_duplicates().reset_index(drop=True)
tv['Country'] = [x.split('.')[1].upper() for x in tv.Channel]
tv['Channel'] = [x.split('.')[0] for x in tv.Channel]
tv['Country'] = tv['Country'].astype(str).str.replace('1', '', regex=False)
tv_table = GT(tv).tab_header(title="Broadcast Channels")
tv_table.save(game_table_image_path1, window_size=(800,400), web_driver=driver)
# Combine images vertically
imgs = [Image.open(x) for x in [game_table_image_path0,game_table_image_path1]]
width = max(i.width for i in imgs)
height = sum(i.height for i in imgs)
combined = Image.new("RGB", (width, height), "white")
y_offset = 0
for im in imgs:
combined.paste(im, (0, y_offset))
y_offset += im.height
combined.save(game_table_image_path)
########
with open(game_table_image_path, 'rb') as image_file:
# Prepare the payload with the file
payload = {
'file': image_file
}
# Send the POST request to the webhook
response = requests.post(ufc_sched_url, files=payload)
os.remove(game_table_image_path)
os.remove(game_table_image_path0)
os.remove(game_table_image_path1)