Working version

2025-09-15 11:36:43 -04:00
parent 3d4afe7eab
commit 2003b9d115
10 changed files with 968 additions and 0 deletions

Dockerfile (new file, 60 lines)

@@ -0,0 +1,60 @@
# Dockerfile
FROM python:3.11-slim
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PIP_NO_CACHE_DIR=1 \
DEBIAN_FRONTEND=noninteractive \
TZ=America/New_York
# System deps for Chromium/chromedriver + rendering & lxml
RUN apt-get update && apt-get install -y --no-install-recommends \
chromium \
chromium-driver \
ca-certificates \
fonts-liberation \
fonts-dejavu \
libx11-6 \
libxcomposite1 \
libxdamage1 \
libxext6 \
libxfixes3 \
libxrandr2 \
libgbm1 \
libgtk-3-0 \
libnss3 \
libasound2 \
libatk-bridge2.0-0 \
libatk1.0-0 \
libcairo2 \
libpango-1.0-0 \
tzdata \
build-essential \
libxml2-dev \
libxslt1-dev \
&& rm -rf /var/lib/apt/lists/*
# Ensure Chromium is on a known path
ENV CHROME_BIN=/usr/bin/chromium \
CHROMEDRIVER=/usr/bin/chromedriver
WORKDIR /app
# Copy deps first for better caching
COPY requirements.txt .
RUN pip install -r requirements.txt
# Copy the app
COPY run.py ./run.py
COPY entrypoint.sh ./entrypoint.sh
COPY src ./src
COPY config ./config
# Make sure data/ exists at runtime (also volume-mounted by compose)
RUN mkdir -p /app/data && chmod +x /app/entrypoint.sh
# Non-root user (optional)
RUN useradd -ms /bin/bash appuser && chown -R appuser:appuser /app
USER appuser
ENTRYPOINT ["/app/entrypoint.sh"]

cronfile (new file, 2 lines)

@@ -0,0 +1,2 @@
# Run epg-runner every 5 minutes
*/5 * * * * docker compose run --rm epg-runner
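# Assumes cron runs this from the compose project directory; otherwise prepend
# `cd /path/to/project &&` or use `docker compose -f /path/to/docker-compose.yml run --rm epg-runner`
# (paths here are placeholders for the actual deployment layout).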

docker-compose.yml (new file, 16 lines)

@@ -0,0 +1,16 @@
version: "3.8"
services:
epg-runner:
build: .
image: epg-runner:latest
container_name: epg-runner
environment:
TZ: America/New_York
XML_URL: ""
MLB_SCHED_URL: ""
EPL_SCHED_URL: ""
UFC_SCHED_URL: ""
volumes:
- ./data:/app/data
    # Don't start automatically; only run when the scheduler calls it
restart: "no"

entrypoint.sh (new file, 13 lines)

@@ -0,0 +1,13 @@
#!/usr/bin/env bash
set -euo pipefail
# Create data dir if missing (also persisted via volume)
mkdir -p /app/data
# Helpful debug: show versions
python --version
echo "Chromium: $(chromium --version || true)"
echo "Chromedriver: $(chromedriver --version || true)"
# Run your task
exec python /app/run.py

requirements.txt (new file, 9 lines)

@@ -0,0 +1,9 @@
beautifulsoup4>=4.12
pandas>=2.2
lxml>=5.2
requests>=2.32
cloudscraper>=1.2
pytz>=2024.1
Pillow>=10.4
great-tables>=0.10.0
selenium>=4.25

run.py (new file, 50 lines)

@@ -0,0 +1,50 @@
# run.py
## Download EPG ##
import os
import sys
import time
from datetime import datetime, timedelta
from pathlib import Path

import requests

xml_url = os.getenv("XML_URL")
DATA_DIR = Path("data")
DATA_DIR.mkdir(parents=True, exist_ok=True)
local_path = DATA_DIR / "epg.xml"
def download_epg():
try:
resp = requests.get(xml_url, timeout=30)
resp.raise_for_status()
local_path.write_bytes(resp.content)
print(f"EPG saved to {local_path}")
except Exception as e:
print(f"Failed to download EPG: {e}", file=sys.stderr)
# If EPG is required for later steps, you may want to sys.exit(1)
download_epg()
def run(mod):
    print(mod.upper())
    # os.system returns the raw wait status on Unix, so any nonzero value means failure
    rc = os.system(f"python -m src.{mod}")
    if rc != 0:
        print(f"ERROR in {mod} (exit status {rc})", file=sys.stderr)
for mod in ["mlb", "epl", "ufc"]:
run(mod)
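# A possible alternative sketch (not wired in): subprocess.run reports the module's
# actual exit code, unlike the raw wait status returned by os.system:
#   import subprocess
#   rc = subprocess.run([sys.executable, "-m", f"src.{mod}"]).returncode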
##################
##### SLEEPY #####
##################
print(datetime.now())
# Sleep until 7:30 the next morning
now = datetime.now()
manana = (now + timedelta(days=1)).replace(hour=7, minute=30, second=0, microsecond=0)
print('sleeping until ' + str(manana))
sleep_seconds = (manana - now).total_seconds()
print(sleep_seconds)
time.sleep(sleep_seconds)
#del game_url
#del main_soup
print(datetime.now())

src/__init__.py (new file, empty)

src/epl.py (new file, 191 lines)

@@ -0,0 +1,191 @@
import os
import re
import xml.etree.ElementTree as ET
from datetime import date

import cloudscraper
import pandas as pd
import pytz
import requests
from bs4 import BeautifulSoup
from great_tables import GT, md

epl_sched_url = os.getenv("EPL_SCHED_URL")
driver = 'chrome'
# Request headers for the fixtures page
headers = {
'Access-Control-Allow-Origin': '*',
'Access-Control-Allow-Methods': 'GET',
'Access-Control-Allow-Headers': 'Content-Type',
'Access-Control-Max-Age': '3600',
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}
url = 'https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures'
# Fetch the webpage content
scraper = cloudscraper.create_scraper()
response = scraper.get(url)
# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')
# Extract tables
tables = pd.read_html(response.text)
soccer_schedule = tables[0]
# --- Clean up Date/Time ---
# Combine Date + Time
soccer_schedule["DateTime_UK"] = pd.to_datetime(
soccer_schedule["Date"].astype(str) + " " + soccer_schedule["Time"].astype(str),
errors="coerce"
)
# Localize to UK time
uk = pytz.timezone("Europe/London")
ny = pytz.timezone("America/New_York")
soccer_schedule["DateTime_UK"] = soccer_schedule["DateTime_UK"].dt.tz_localize(uk, ambiguous="NaT", nonexistent="NaT")
# Convert to New York time
soccer_schedule["DateTime_NY"] = soccer_schedule["DateTime_UK"].dt.tz_convert(ny)
# Format for display in am/pm style
soccer_schedule["NY_Time"] = soccer_schedule["DateTime_NY"].dt.strftime("%I:%M %p")
# Drop leading zeros from hour (optional)
soccer_schedule["NY_Time"] = soccer_schedule["NY_Time"].str.lstrip("0")
# Fix team naming so it matches the EPG programme titles
soccer_schedule = soccer_schedule.replace("Nott'ham Forest", "Nottingham Forest")
soccer_schedule = soccer_schedule.replace("Newcastle Utd", "Newcastle United")
soccer_schedule = soccer_schedule.replace("Wolves", "Wolverhampton Wanderers")
# Filter today's schedule & keep merge_key
df_today = (
soccer_schedule[
pd.to_datetime(soccer_schedule.Date) == pd.to_datetime(date.today())
]
.reset_index()
.rename({'index': 'merge_key'}, axis='columns')
)
df_today = df_today[['Date', 'NY_Time', 'Away', 'Home', 'Wk', 'merge_key']]
if len(df_today)>0:
# Load XML
file_path = "data/epg.xml" # replace with actual path
tree = ET.parse(file_path)
root = tree.getroot()
data = []
# Iterate schedule rows
for _, row in df_today.iterrows():
home = str(row["Home"]).strip()
away = str(row["Away"]).strip()
for prog in root.findall("programme"):
title = prog.find("title").text if prog.find("title") is not None else ""
desc = prog.find("desc").text if prog.find("desc") is not None else ""
# Keep only if Premier League and both team names appear
if "Premier League" in title and home in title and away in title:
data.append({
"merge_key": row["merge_key"], # ✅ carry over merge_key
"Schedule_Home": home,
"Schedule_Away": away,
"Start": prog.attrib.get("start"),
"Stop": prog.attrib.get("stop"),
"Channel": prog.attrib.get("channel"),
"Title": title.strip() })
# Create DataFrame
df_matches = pd.DataFrame(data)
# Convert start/stop to datetime
df_matches["Start"] = pd.to_datetime(df_matches["Start"].str[:14], format="%Y%m%d%H%M%S", errors="coerce", utc=True)
df_matches["Stop"] = pd.to_datetime(df_matches["Stop"].str[:14], format="%Y%m%d%H%M%S", errors="coerce", utc=True)
df = pd.merge(
df_today,
df_matches[['merge_key','Schedule_Home','Schedule_Away','Channel']].drop_duplicates(),
how='left',
on='merge_key'
)
####
for game in df['merge_key'].unique():
temp = df[df['merge_key']==game].drop_duplicates().reset_index(drop=True)
temp = temp.rename({'NY_Time':'Time'},axis='columns')
temp['Wk'] = temp.Wk.astype('int')
temp2 = temp[['Channel']].drop_duplicates().reset_index(drop=True)
# Split once on "." into two columns
split_cols = temp2['Channel'].str.split('.', n=1, expand=True)
temp2['Channel'] = split_cols[0] # part before "."
temp2['Country'] = split_cols[1].str.upper().fillna("") # part after ".", or "" if missing
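        # e.g. a raw id like "NBCSN.us" (hypothetical) splits into Channel "NBCSN", Country "US"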
# Reorder
temp2 = temp2[['Country', 'Channel']]
        # Collapse regional NBC feeds (e.g. channel ids like NBCSPORTS) into a single "NBC" row
        temp2['Country'] = temp2['Country'].replace('US2', 'US')
        pattern = re.compile(r"^NBC[A-Z]{3,6}$")
        temp2["Channel"] = temp2["Channel"].apply(lambda x: "NBC" if pattern.match(x) else x)
temp2 = temp2.drop_duplicates()
###
game_table = (
GT(temp2.drop_duplicates())
.tab_header(
title=f"{temp['Away'][0]} @ {temp['Home'][0]}",
subtitle=f"Matchweek {temp.Wk[0]} {temp['Time'][0]} {str(temp['Date'][0]).split(' ')[0]}",
).tab_source_note(md(' '))
)
game_table_image_path = 'data/epl.png'
game_table.save(game_table_image_path,window_size=(1000, 1000),web_driver=driver)
with open(game_table_image_path, 'rb') as image_file:
# Prepare the payload with the file
payload = {
'file': image_file
}
# Send the POST request to the webhook
response = requests.post(epl_sched_url, files=payload)
os.remove(game_table_image_path)

src/mlb.py (new file, 333 lines)

@@ -0,0 +1,333 @@
import os
import re
from datetime import datetime, date, time, timedelta, timezone
from pathlib import Path
from zoneinfo import ZoneInfo  # Python 3.9+

import cloudscraper
import pandas as pd
import requests
from bs4 import BeautifulSoup
from great_tables import GT, md
from lxml import etree

mlb_sched_url = os.getenv("MLB_SCHED_URL")
driver = 'chrome'
year = date.today().year
# Request headers for the schedule page
headers = {
'Access-Control-Allow-Origin': '*',
'Access-Control-Allow-Methods': 'GET',
'Access-Control-Allow-Headers': 'Content-Type',
'Access-Control-Max-Age': '3600',
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}
url = f'https://www.baseball-reference.com/leagues/majors/{year}-schedule.shtml'
# Fetch the webpage content
scraper = cloudscraper.create_scraper()
response = scraper.get(url)
# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')
today_section = soup.find("span", id="today")
data = []
if today_section:
for game in today_section.find_all_next("p", class_="game"):
gm_time = game.find("strong").get_text(strip=True) if game.find("strong") else None
teams = game.find_all("a")
if len(teams) >= 2:
away_team = teams[0].get_text(strip=True)
home_team = teams[1].get_text(strip=True)
else:
away_team = home_team = None
preview_tag = game.find("em")
preview_link = preview_tag.a["href"] if preview_tag and preview_tag.a else None
# Extract date from preview link if present
game_date = None
if preview_link:
game_date = pd.to_datetime(preview_link.split('/')[3][3:11])
data.append({
"Time": gm_time,
"Away Team": away_team,
"Home Team": home_team,
"Preview Link": preview_link,
"Game Date": game_date
})
sched = pd.DataFrame(data)
sched = sched[sched['Game Date']==str(date.today())]
if len(sched)>0:
sched['Away Team'] = sched['Away Team'].replace("Arizona D'Backs","Arizona Diamondbacks")
sched['Home Team'] = sched['Home Team'].replace("Arizona D'Backs","Arizona Diamondbacks")
##############
teams = list(pd.concat([sched['Away Team'], sched['Home Team'] ]))
##############
# --- point this to your file ---
xml_path = Path("data/epg.xml")
# Parse with lxml recovery; wrap to guarantee a single root
parser = etree.XMLParser(recover=True, encoding="utf-8")
tree = etree.fromstring(b"<root>" + xml_path.read_bytes() + b"</root>", parser)
# Filters
#teams = ["Philadelphia Phillies", "Boston Red Sox"]
# Regex for filtering (any team)
pattern = re.compile("|".join(map(re.escape, teams)), re.IGNORECASE)
# Helper: find which team(s) appear in title, return a list (no duplicates, canonical casing)
def find_teams_list(title: str, teams_list):
found = []
for t in teams_list:
if re.search(re.escape(t), title, re.IGNORECASE):
found.append(t)
# Ensure max 2 and stable order as in teams_list
return found[:2]
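    # e.g. with teams = ["Boston Red Sox", "New York Yankees"] (hypothetical), a programme title
    # "MLB Baseball: Boston Red Sox at New York Yankees" yields ["Boston Red Sox", "New York Yankees"];
    # the order follows teams_list, not the title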
# Collect rows
rows = []
for prog in tree.findall(".//programme"):
title_el = prog.find("title")
title = (title_el.text or "").strip() if title_el is not None else ""
if not pattern.search(title):
continue
teams_found_list = find_teams_list(title, teams)
if not teams_found_list: # safety
continue
start_raw = prog.get("start") or "" # e.g. "20250910221000 +0000"
channel = prog.get("channel")
# Parse start as UTC (first 14 chars = YYYYMMDDHHMMSS)
try:
dt_utc = datetime.strptime(start_raw[:14], "%Y%m%d%H%M%S").replace(tzinfo=timezone.utc)
except Exception:
continue
rows.append({
"title": title,
"start": start_raw,
"channel": channel,
"start_dt_utc": dt_utc,
# temporarily store full list; we'll split later
"teams_found_list": teams_found_list
})
tv = pd.DataFrame(rows)
if tv.empty:
print(tv)
else:
# Convert to local time (America/New_York)
local_tz = ZoneInfo("America/New_York")
tv["start_dt_local"] = tv["start_dt_utc"].dt.tz_convert(local_tz)
# Compute today's local window
today_local = datetime.now(local_tz).date()
start_of_day = datetime.combine(today_local, time.min, tzinfo=local_tz)
end_of_day = start_of_day + timedelta(days=1)
# Filter to events whose local start falls on "today"
mask = (tv["start_dt_local"] >= start_of_day) & (tv["start_dt_local"] < end_of_day)
df_today = tv.loc[mask, ["title", "start", "channel", "teams_found_list"]].reset_index(drop=True)
# ---- split start into local_date/local_time columns ----
def split_start_to_local_date_time(start_str: str, tz_str: str = "America/New_York"):
if not start_str:
return None, None
dt_utc = datetime.strptime(start_str[:14], "%Y%m%d%H%M%S").replace(tzinfo=timezone.utc)
local_dt = dt_utc.astimezone(ZoneInfo(tz_str))
return local_dt.date(), local_dt.time()
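        # e.g. "20250910221000 +0000" -> (date(2025, 9, 10), time(18, 10)) in America/New_York (EDT)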
df_today[["local_date", "local_time"]] = df_today["start"].apply(
lambda s: pd.Series(split_start_to_local_date_time(s))
)
# ---- create team_found_1 and team_found_2 columns ----
def to_two_cols(lst):
# pad to length 2 with None
padded = (lst + [None, None])[:2]
return pd.Series(padded, index=["team_found_1", "team_found_2"])
df_today[["team_found_1", "team_found_2"]] = df_today["teams_found_list"].apply(to_two_cols)
df_today = df_today.drop(columns=["teams_found_list"])
# reorder columns if you like
df_today = df_today[["title", "channel", "start", "local_date", "local_time", "team_found_1", "team_found_2"]]
# Optional: df_today.to_csv("filtered_programmes_today.csv", index=False)
#####
########
df = pd.merge(
sched,
df_today[[
'channel',
'local_time',
'team_found_1'
]].drop_duplicates().rename({
'team_found_1':'Away Team'
},axis='columns'),
how='left',
on='Away Team'
)
#########
        df = df.copy()  # work on a copy before adding derived columns
# 1) Build the "game start" local datetime from Game Date + Time (e.g., "2025-09-10" + "2:35 pm")
df["Game Date"] = pd.to_datetime(df["Game Date"], errors="coerce")
df["Time"] = df["Time"].astype(str).str.strip()
game_dt = pd.to_datetime(
df["Game Date"].dt.strftime("%Y-%m-%d") + " " + df["Time"],
format="%Y-%m-%d %I:%M %p",
errors="coerce"
)
# 2) Parse away/home local times (strings like "14:30:00") and combine with Game Date
def combine_date_and_hms(date_series, hms_series):
# parse HH:MM:SS (coerce invalid to NaT)
parsed = pd.to_datetime(hms_series.astype(str).str.strip(), format="%H:%M:%S", errors="coerce")
# keep only time-of-day; combine with date
return pd.to_datetime(date_series.dt.strftime("%Y-%m-%d") + " " + parsed.dt.strftime("%H:%M:%S"), errors="coerce")
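        # e.g. Game Date 2025-09-10 combined with "18:10:00" -> Timestamp("2025-09-10 18:10:00")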
away_dt = combine_date_and_hms(df["Game Date"], df["local_time"])
        # 3) Keep rows where the away-side EPG start time falls within a window around
        #    the scheduled first pitch (1 hour before to 4 hours after)
        mask = (away_dt >= game_dt - timedelta(hours=1)) & (away_dt <= game_dt + timedelta(hours=4))
filtered = df.loc[mask].copy()
df = pd.merge(
sched,
df_today[[
'channel',
'local_time',
'team_found_2'
]].drop_duplicates().rename({
'team_found_2':'Home Team'
},axis='columns'),
how='left',
on='Home Team',
suffixes=['_Away','_Home']
)
######
        df = df.copy()  # work on a copy before adding derived columns
# 1) Build the "game start" local datetime from Game Date + Time (e.g., "2025-09-10" + "2:35 pm")
df["Game Date"] = pd.to_datetime(df["Game Date"], errors="coerce")
df["Time"] = df["Time"].astype(str).str.strip()
game_dt = pd.to_datetime(
df["Game Date"].dt.strftime("%Y-%m-%d") + " " + df["Time"],
format="%Y-%m-%d %I:%M %p",
errors="coerce"
)
        # 2) Reuse combine_date_and_hms (defined above) to combine Game Date with the
        #    home-side local times
        home_dt = combine_date_and_hms(df["Game Date"], df["local_time"])
        # 3) Keep rows where the home-side EPG start time falls within the same window
        #    around the scheduled first pitch (1 hour before to 4 hours after)
        mask = (home_dt >= game_dt - timedelta(hours=1)) & (home_dt <= game_dt + timedelta(hours=4))
filtered2 = df.loc[mask].copy()
######
df = pd.merge(
filtered,
filtered2,
how='left',
on=['Time','Away Team','Home Team','Preview Link','Game Date'],
suffixes=['_Home','_Away']
)
#####
for game in df['Preview Link'].unique():
temp = df[df['Preview Link']==game].drop_duplicates().reset_index(drop=True)
temp2 = pd.DataFrame({
'Channel':list(temp['channel_Home'].dropna())+list(temp['channel_Away'].dropna())
})
temp2 = temp2.drop_duplicates().reset_index(drop=True)
temp2['Country'] = [x.split('.')[1].upper() for x in temp2.Channel]
temp2['Channel'] = [x.split('.')[0] for x in temp2.Channel]
temp2 = temp2[['Country','Channel']]
# Regex pattern: 'FOX' + 4 uppercase letters (total length 7, all caps)
pattern = re.compile(r"^FOX[A-Z]{4}$")
# Replace matching channels with 'FOX'
temp2['Country'] = temp2['Country'].replace('US2','US')
temp2["Channel"] = temp2["Channel"].apply(lambda x: "FOX" if pattern.match(x) else x)
temp2 = temp2.drop_duplicates()
game_table = (
GT(temp2.drop_duplicates())
.tab_header(
title=f"{temp['Away Team'][0]} @ {temp['Home Team'][0]}",
subtitle=f"{temp['Time'][0]} {str(temp['Game Date'][0]).split(' ')[0]}",
).tab_source_note(md(' '))
)
game_table_image_path = 'data/mlb.png'
game_table.save(game_table_image_path,window_size=(1000, 1000),web_driver=driver)
with open(game_table_image_path, 'rb') as image_file:
# Prepare the payload with the file
payload = {
'file': image_file
}
# Send the POST request to the webhook
response = requests.post(mlb_sched_url, files=payload)
os.remove(game_table_image_path)

src/ufc.py (new file, 294 lines)

@@ -0,0 +1,294 @@
import os
import re
import unicodedata
import xml.etree.ElementTree as ET
from datetime import date

import cloudscraper
import pandas as pd
import requests
from bs4 import BeautifulSoup, Tag
from great_tables import GT, md
from PIL import Image

ufc_sched_url = os.getenv("UFC_SCHED_URL")
driver = 'chrome'
# Request headers for the schedule page
headers = {
'Access-Control-Allow-Origin': '*',
'Access-Control-Allow-Methods': 'GET',
'Access-Control-Allow-Headers': 'Content-Type',
'Access-Control-Max-Age': '3600',
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}
url = 'https://www.mmafighting.com/schedule/ufc'
# Fetch the webpage content
scraper = cloudscraper.create_scraper()
response = scraper.get(url)
# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')
events, dates, card_types, fights = [], [], [], []
main_starts, prelim_starts = [], []
def get_card_type(li: Tag) -> str:
"""Find the nearest split section header (Main Card / Undercard)."""
split = li.find_parent("div", class_="m-mmaf-pte-event-list__split-item")
if split:
h4 = split.find("h4")
if h4:
return h4.get_text(strip=True)
return "Main Card" # default for top-level <ul> blocks
_time_pat = r"([0-9]{1,2}(?::[0-9]{2})?\s*[ap]\.m\.)"
def normalize_time(t: str | None) -> str | None:
if not t:
return None
s = t.lower()
s = re.sub(r"\s*a\.m\.", "am", s)
s = re.sub(r"\s*p\.m\.", "pm", s)
s = s.replace(" ", "")
return s # e.g., "6pm", "7:30pm"
def extract_event_times(section_nodes: list[Tag]) -> tuple[str | None, str | None]:
"""Look for the tv-info <p> within this event's own section only."""
tv_text = None
for n in section_nodes:
if isinstance(n, Tag) and n.name == "p" and "m-mmaf-pte-event-list__tv-info" in (n.get("class") or []):
tv_text = n.get_text(" ", strip=True).lower()
break
if not tv_text:
return None, None
main_m = re.search(r"main\s*card[^;]*?\bat\s*" + _time_pat, tv_text)
prelim_m = re.search(r"prelims?[^;]*?\bat\s*" + _time_pat, tv_text)
main_time = normalize_time(main_m.group(1)) if main_m else None
prelim_time = normalize_time(prelim_m.group(1)) if prelim_m else None
return main_time, prelim_time
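# e.g. a tv-info line like "the main card at 8 p.m. with prelims at 6 p.m." (hypothetical) -> ("8pm", "6pm")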
# The schedule list wrapper(s)
wrappers = soup.find_all("div", class_="m-mmaf-pte-event-list")
for wrapper in wrappers:
# iterate each event header inside the wrapper
for h2 in wrapper.find_all("h2"):
event_name = h2.get_text(strip=True)
# the date for this event is the next sibling <h3>
date_tag = h2.find_next_sibling("h3")
event_date = date_tag.get_text(strip=True) if date_tag else ""
# gather siblings after date_tag (or h2 if no date) until the next <h2>
start_from = date_tag if date_tag else h2
section_nodes = []
for sib in start_from.next_siblings:
if isinstance(sib, Tag) and sib.name == "h2":
break
section_nodes.append(sib)
# extract TV times **for this event only**
main_time, prelim_time = extract_event_times(section_nodes)
            # within this event's section, collect every <li> fight
for node in section_nodes:
if not isinstance(node, Tag):
continue
for li in node.find_all("li"):
# prefer the anchor text; fallback to full li text
a = li.find("a")
fight_text = a.get_text(" ", strip=True) if a else li.get_text(" ", strip=True)
# strip any trailing labels like "Title Fight"
fight_text = fight_text.replace("Title Fight", "").strip()
events.append(event_name)
dates.append(event_date)
card_types.append(get_card_type(li))
fights.append(fight_text)
main_starts.append(main_time)
prelim_starts.append(prelim_time)
# build dataframe
sched = pd.DataFrame({
"Event": events,
"Date": dates,
"Card Type": card_types,
"Fight": fights,
"main_start": main_starts,
"prelim_start": prelim_starts
})
# Keep only events scheduled for today
sched = sched[
(pd.to_datetime(sched.Date) == pd.to_datetime(date.today()))
].reset_index(drop=True)
if len(sched)>0:
# group fights by consecutive card type (keep first two groups)
event=[0]
j=0
for i in range(1,len(sched)):
if sched['Card Type'][i] == sched['Card Type'][i-1]:
event.append(j)
else:
j += 1
event.append(j)
sched['i'] = event
sched = sched[sched['i'] < 2].drop(['i'], axis='columns')
###########
# --- normalization helpers ---
def normalize(text: str) -> str:
if not text:
return ""
text = text.lower()
text = unicodedata.normalize("NFKD", text)
text = "".join(c for c in text if not unicodedata.combining(c))
text = text.replace("vs.", "x")
text = re.sub(r"[^a-z0-9\s]", " ", text) # keep alphanum only
text = re.sub(r"\s+", " ", text).strip()
return text
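    # e.g. normalize("Lopes vs. Silva") -> "lopes x silva"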
def tokenize_event(event_title: str):
"""Return prefix tokens and headline tokens separately."""
parts = [p.strip() for p in event_title.split(":")]
if len(parts) == 2:
prefix = normalize(parts[0])
headline = normalize(parts[1])
else:
prefix, headline = "", normalize(event_title)
prefix_tokens = set(prefix.split()) - {"ufc", "fight", "night", "new"}
headline_tokens = set(headline.split()) - {"ufc", "fight", "night", "new"}
return prefix_tokens, headline_tokens
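    # e.g. tokenize_event("Noche UFC: Lopes vs. Silva") -> ({"noche"}, {"lopes", "x", "silva"})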
    # Tokenize the headline of today's scheduled event
    event_title = sched['Event'][0]
    prefix_tokens, headline_tokens = tokenize_event(event_title)
# --- parse XML ---
xml_file = "data/epg.xml"
tree = ET.parse(xml_file)
root = tree.getroot()
matches = []
for prog in root.findall("programme"):
title_el = prog.find("title")
desc_el = prog.find("desc")
title_text = title_el.text if (title_el is not None and title_el.text) else ""
desc_text = desc_el.text if (desc_el is not None and desc_el.text) else ""
full_text = normalize(title_text + " " + desc_text)
# check: require all headline tokens present in full_text
if all(tok in full_text for tok in headline_tokens):
matches.append({
"event": event_title,
"programme_title": title_text.strip(),
"desc": desc_text.strip(),
"start": prog.attrib.get("start"),
"stop": prog.attrib.get("stop"),
"channel": prog.attrib.get("channel")
})
################
game_table_image_path = "data/ufc.png"
game_table_image_path0 = "data/ufc_sched.png"
game_table_image_path1 = "data/ufc_tv.png"
# Save UFC schedule table
    temp = sched[['Card Type', 'Fight']].copy()
    temp['Card Type'] = temp['Card Type'].mask(temp['Card Type'].duplicated(), '')
game_table = (
GT(temp)
.tab_header(
title=f"{sched['Event'][0]}",
subtitle=md(
f"{sched['Date'][0]} \nPrelims: {str(sched['prelim_start'][0]).split(' ')[0]} \nMain Event: {str(sched['main_start'][0]).split(' ')[0]}"
),
)
.tab_source_note(md(' '))
)
game_table.save(game_table_image_path0, window_size=(1000,1000), web_driver=driver)
# Render TV table separately
tv = pd.DataFrame(matches)
tv = tv[['channel']].rename({'channel':'Channel'},axis='columns').drop_duplicates().reset_index(drop=True)
tv['Country'] = [x.split('.')[1].upper() for x in tv.Channel]
tv['Channel'] = [x.split('.')[0] for x in tv.Channel]
tv['Country'] = tv['Country'].astype(str).str.replace('1', '', regex=False)
tv_table = GT(tv).tab_header(title="Broadcast Channels")
tv_table.save(game_table_image_path1, window_size=(800,400), web_driver=driver)
# Combine images vertically
imgs = [Image.open(x) for x in [game_table_image_path0,game_table_image_path1]]
width = max(i.width for i in imgs)
height = sum(i.height for i in imgs)
combined = Image.new("RGB", (width, height), "white")
y_offset = 0
for im in imgs:
combined.paste(im, (0, y_offset))
y_offset += im.height
combined.save(game_table_image_path)
########
with open(game_table_image_path, 'rb') as image_file:
# Prepare the payload with the file
payload = {
'file': image_file
}
# Send the POST request to the webhook
response = requests.post(ufc_sched_url, files=payload)
os.remove(game_table_image_path)
os.remove(game_table_image_path0)
os.remove(game_table_image_path1)