diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..be1a6e7
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,60 @@
+# Dockerfile
+FROM python:3.11-slim
+
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    DEBIAN_FRONTEND=noninteractive \
+    TZ=America/New_York
+
+# System deps for Chromium/chromedriver + rendering & lxml
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    chromium \
+    chromium-driver \
+    ca-certificates \
+    fonts-liberation \
+    fonts-dejavu \
+    libx11-6 \
+    libxcomposite1 \
+    libxdamage1 \
+    libxext6 \
+    libxfixes3 \
+    libxrandr2 \
+    libgbm1 \
+    libgtk-3-0 \
+    libnss3 \
+    libasound2 \
+    libatk-bridge2.0-0 \
+    libatk1.0-0 \
+    libcairo2 \
+    libpango-1.0-0 \
+    tzdata \
+    build-essential \
+    libxml2-dev \
+    libxslt1-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Ensure Chromium is on a known path
+ENV CHROME_BIN=/usr/bin/chromium \
+    CHROMEDRIVER=/usr/bin/chromedriver
+
+WORKDIR /app
+
+# Copy deps first for better caching
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+
+# Copy the app
+COPY run.py ./run.py
+COPY entrypoint.sh ./entrypoint.sh
+COPY src ./src
+COPY config ./config
+
+# Make sure data/ exists at runtime (also volume-mounted by compose)
+RUN mkdir -p /app/data && chmod +x /app/entrypoint.sh
+
+# Non-root user (optional)
+RUN useradd -ms /bin/bash appuser && chown -R appuser:appuser /app
+USER appuser
+
+ENTRYPOINT ["/app/entrypoint.sh"]
diff --git a/cronfile b/cronfile
new file mode 100644
index 0000000..ddf3293
--- /dev/null
+++ b/cronfile
@@ -0,0 +1,2 @@
+# Run epg-runner every 5 minutes
+*/5 * * * * docker compose run --rm epg-runner
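+# Note (assumption): cron runs with a minimal PATH and no working directory,
+# so in practice this entry usually needs an explicit cd into the compose
+# project and an absolute docker binary, e.g. (hypothetical paths):
+# */5 * * * * cd /opt/epg && /usr/bin/docker compose run --rm epg-runner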
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..063d89d
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,16 @@
+version: "3.8"
+services:
+  epg-runner:
+    build: .
+    image: epg-runner:latest
+    container_name: epg-runner
+    environment:
+      TZ: America/New_York
+      XML_URL: ""
+      MLB_SCHED_URL: ""
+      EPL_SCHED_URL: ""
+      UFC_SCHED_URL: ""
+    volumes:
+      - ./data:/app/data
+    # Don’t start automatically, only when scheduler calls it
+    restart: "no"
diff --git a/entrypoint.sh b/entrypoint.sh
new file mode 100644
index 0000000..dab9bce
--- /dev/null
+++ b/entrypoint.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Create data dir if missing (also persisted via volume)
+mkdir -p /app/data
+
+# Helpful debug: show versions
+python --version
+echo "Chromium: $(chromium --version || true)"
+echo "Chromedriver: $(chromedriver --version || true)"
+
+# Run your task
+exec python /app/run.py
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..abe9a90
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+beautifulsoup4>=4.12
+pandas>=2.2
+lxml>=5.2
+requests>=2.32
+cloudscraper>=1.2
+pytz>=2024.1
+Pillow>=10.4
+great-tables>=0.10.0
+selenium>=4.25
diff --git a/run.py b/run.py
new file mode 100644
index 0000000..782356b
--- /dev/null
+++ b/run.py
@@ -0,0 +1,50 @@
+# run.py
+## Download EPG ##
+import os
+import sys
+import time
+import requests
+from pathlib import Path
+from datetime import datetime, timedelta
+
+xml_url = os.getenv("XML_URL")
+
+DATA_DIR = Path("data")
+DATA_DIR.mkdir(parents=True, exist_ok=True)
+local_path = DATA_DIR / "epg.xml"
+
+def download_epg():
+    try:
+        resp = requests.get(xml_url, timeout=30)
+        resp.raise_for_status()
+        local_path.write_bytes(resp.content)
+        print(f"EPG saved to {local_path}")
+    except Exception as e:
+        print(f"Failed to download EPG: {e}", file=sys.stderr)
+        # If EPG is required for later steps, you may want to sys.exit(1)
+
+download_epg()
+
+def run(mod):
+    print(mod.upper())
+    rc = os.system(f"python -m src.{mod}")
+    if rc != 0:
+        print(f"ERROR in {mod} (exit {rc})")
+
+for mod in ["mlb", "epl", "ufc"]:
+    run(mod)
+
+##################
+##### SLEEPY #####
+##################
+
+now = datetime.now()
+print(now)
+
+# Sleep until 07:30 tomorrow morning
+manana0 = now + timedelta(days=1)
+manana = datetime(manana0.year, manana0.month, manana0.day, 7, 30)
+print('sleeping until ' + str(manana))
+print((manana - now).total_seconds())
+time.sleep((manana - now).total_seconds())
+
+print(datetime.now())
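+
+# Note: os.system returns a raw wait status rather than a clean exit code.
+# A minimal subprocess-based sketch (same effect, clearer error reporting):
+#
+#   import subprocess
+#   rc = subprocess.run([sys.executable, "-m", f"src.{mod}"]).returncode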
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/epl.py b/src/epl.py
new file mode 100644
index 0000000..ef10418
--- /dev/null
+++ b/src/epl.py
@@ -0,0 +1,191 @@
+import os
+import re
+import requests
+import cloudscraper
+import pandas as pd
+import pytz
+import xml.etree.ElementTree as ET
+from bs4 import BeautifulSoup
+from datetime import date
+from great_tables import GT, md
+
+epl_sched_url = os.getenv("EPL_SCHED_URL")
+driver = 'chrome'
+
+# Request headers (kept for reference; cloudscraper sets its own browser headers)
+headers = {
+    'Access-Control-Allow-Origin': '*',
+    'Access-Control-Allow-Methods': 'GET',
+    'Access-Control-Allow-Headers': 'Content-Type',
+    'Access-Control-Max-Age': '3600',
+    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
+}
+
+url = 'https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures'
+# Fetch the webpage content
+scraper = cloudscraper.create_scraper()
+response = scraper.get(url)
+
+# Parse the HTML content using BeautifulSoup
+soup = BeautifulSoup(response.content, 'html.parser')
+
+# Extract tables
+tables = pd.read_html(response.text)
+soccer_schedule = tables[0]
+
+# --- Clean up Date/Time ---
+# Combine Date + Time
+soccer_schedule["DateTime_UK"] = pd.to_datetime(
+    soccer_schedule["Date"].astype(str) + " " + soccer_schedule["Time"].astype(str),
+    errors="coerce"
+)
+
+# Localize to UK time
+uk = pytz.timezone("Europe/London")
+ny = pytz.timezone("America/New_York")
+soccer_schedule["DateTime_UK"] = soccer_schedule["DateTime_UK"].dt.tz_localize(uk, ambiguous="NaT", nonexistent="NaT")
+
+# Convert to New York time
+soccer_schedule["DateTime_NY"] = soccer_schedule["DateTime_UK"].dt.tz_convert(ny)
+
+# Format for display in am/pm style
+soccer_schedule["NY_Time"] = soccer_schedule["DateTime_NY"].dt.strftime("%I:%M %p")
+
+# Drop leading zeros from hour (optional)
+soccer_schedule["NY_Time"] = soccer_schedule["NY_Time"].str.lstrip("0")
+
+# Fix team naming
+soccer_schedule = soccer_schedule.replace("Nott'ham Forest", "Nottingham Forest")
+soccer_schedule = soccer_schedule.replace("Newcastle Utd", "Newcastle United")
+soccer_schedule = soccer_schedule.replace("Wolves", "Wolverhampton Wanderers")
+
+# Filter today's schedule & keep merge_key
+df_today = (
+    soccer_schedule[
+        pd.to_datetime(soccer_schedule.Date, errors="coerce") == pd.to_datetime(date.today())
+    ]
+    .reset_index()
+    .rename({'index': 'merge_key'}, axis='columns')
+)
+
+df_today = df_today[['Date', 'NY_Time', 'Away', 'Home', 'Wk', 'merge_key']]
+
+if len(df_today) > 0:
+    # Load XML
+    file_path = "data/epg.xml"
+    tree = ET.parse(file_path)
+    root = tree.getroot()
+
+    data = []
+
+    # Iterate schedule rows
+    for _, row in df_today.iterrows():
+        home = str(row["Home"]).strip()
+        away = str(row["Away"]).strip()
+
+        for prog in root.findall("programme"):
+            title = prog.find("title").text if prog.find("title") is not None else ""
+            desc = prog.find("desc").text if prog.find("desc") is not None else ""
+
+            # Keep only if Premier League and both team names appear
+            if "Premier League" in title and home in title and away in title:
+                data.append({
+                    "merge_key": row["merge_key"],  # carry over merge_key
+                    "Schedule_Home": home,
+                    "Schedule_Away": away,
+                    "Start": prog.attrib.get("start"),
+                    "Stop": prog.attrib.get("stop"),
+                    "Channel": prog.attrib.get("channel"),
+                    "Title": title.strip()
+                })
+
+    # Create DataFrame (explicit columns so an empty result still has them)
+    df_matches = pd.DataFrame(
+        data,
+        columns=["merge_key", "Schedule_Home", "Schedule_Away", "Start", "Stop", "Channel", "Title"]
+    )
+
+    # Convert start/stop to datetime
+    df_matches["Start"] = pd.to_datetime(df_matches["Start"].str[:14], format="%Y%m%d%H%M%S", errors="coerce", utc=True)
+    df_matches["Stop"] = pd.to_datetime(df_matches["Stop"].str[:14], format="%Y%m%d%H%M%S", errors="coerce", utc=True)
+
+    df = pd.merge(
+        df_today,
+        df_matches[['merge_key', 'Schedule_Home', 'Schedule_Away', 'Channel']].drop_duplicates(),
+        how='left',
+        on='merge_key'
+    )
+
+    ####
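+    # Channel ids in this EPG feed are assumed to look like "NAME.country",
+    # e.g. "SKYSPORTSFOOTBALL.uk" (hypothetical), so a single split on "."
+    # separates channel from country below; other XMLTV feeds may differ.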
+    for game in df['merge_key'].unique():
+        temp = df[df['merge_key'] == game].drop_duplicates().reset_index(drop=True)
+        temp = temp.rename({'NY_Time': 'Time'}, axis='columns')
+        temp['Wk'] = temp.Wk.astype('int')
+
+        temp2 = temp[['Channel']].drop_duplicates().reset_index(drop=True)
+
+        # Split once on "." into two columns
+        split_cols = temp2['Channel'].str.split('.', n=1, expand=True)
+        temp2['Channel'] = split_cols[0]  # part before "."
+        temp2['Country'] = split_cols[1].str.upper().fillna("")  # part after ".", or "" if missing
+
+        # Reorder
+        temp2 = temp2[['Country', 'Channel']]
+
+        # Normalize country codes
+        temp2['Country'] = temp2['Country'].replace('US2', 'US')
+        # Regex pattern: 'NBC' followed by 3-6 uppercase letters (regional NBC feeds)
+        pattern = re.compile(r"^NBC[A-Z]{3,6}$")
+        # Collapse matching channels to plain 'NBC'
+        temp2["Channel"] = temp2["Channel"].apply(
+            lambda x: "NBC" if isinstance(x, str) and pattern.match(x) else x
+        )
+
+        temp2 = temp2.drop_duplicates()
+        ###
+
+        game_table = (
+            GT(temp2.drop_duplicates())
+            .tab_header(
+                title=f"{temp['Away'][0]} @ {temp['Home'][0]}",
+                subtitle=f"Matchweek {temp.Wk[0]} {temp['Time'][0]} {str(temp['Date'][0]).split(' ')[0]}",
+            ).tab_source_note(md(' '))
+        )
+        game_table_image_path = 'data/epl.png'
+        game_table.save(game_table_image_path, window_size=(1000, 1000), web_driver=driver)
+
+        with open(game_table_image_path, 'rb') as image_file:
+            # Prepare the payload with the file
+            payload = {
+                'file': image_file
+            }
+
+            # Send the POST request to the webhook
+            response = requests.post(epl_sched_url, files=payload)
+
+        os.remove(game_table_image_path)
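+
+# Note (assumption): the webhook is expected to accept a multipart upload
+# under the "file" form field (Discord-style); adjust the field name if the
+# receiving endpoint differs.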
diff --git a/src/mlb.py b/src/mlb.py
new file mode 100644
index 0000000..4b4b2b7
--- /dev/null
+++ b/src/mlb.py
@@ -0,0 +1,333 @@
+import os
+import re
+import requests
+import cloudscraper
+import pandas as pd
+from bs4 import BeautifulSoup
+from lxml import etree
+from pathlib import Path
+from datetime import datetime, date, time, timedelta, timezone
+from zoneinfo import ZoneInfo  # Python 3.9+
+from great_tables import GT, md
+
+mlb_sched_url = os.getenv("MLB_SCHED_URL")
+driver = 'chrome'
+
+year = date.today().year
+
+# Request headers (kept for reference; cloudscraper sets its own browser headers)
+headers = {
+    'Access-Control-Allow-Origin': '*',
+    'Access-Control-Allow-Methods': 'GET',
+    'Access-Control-Allow-Headers': 'Content-Type',
+    'Access-Control-Max-Age': '3600',
+    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
+}
+
+url = f'https://www.baseball-reference.com/leagues/majors/{year}-schedule.shtml'
+# Fetch the webpage content
+scraper = cloudscraper.create_scraper()
+response = scraper.get(url)
+
+# Parse the HTML content using BeautifulSoup
+soup = BeautifulSoup(response.content, 'html.parser')
+
+today_section = soup.find("span", id="today")
+
+data = []
+if today_section:
+    for game in today_section.find_all_next("p", class_="game"):
+        gm_time = game.find("strong").get_text(strip=True) if game.find("strong") else None
+        teams = game.find_all("a")
+        if len(teams) >= 2:
+            away_team = teams[0].get_text(strip=True)
+            home_team = teams[1].get_text(strip=True)
+        else:
+            away_team = home_team = None
+
+        preview_tag = game.find("em")
+        preview_link = preview_tag.a["href"] if preview_tag and preview_tag.a else None
+
+        # Extract date from preview link if present
+        game_date = None
+        if preview_link:
+            game_date = pd.to_datetime(preview_link.split('/')[3][3:11])
+
+        data.append({
+            "Time": gm_time,
+            "Away Team": away_team,
+            "Home Team": home_team,
+            "Preview Link": preview_link,
+            "Game Date": game_date
+        })
+
+sched = pd.DataFrame(data)
+sched = sched[sched['Game Date'] == pd.to_datetime(date.today())]
+
+if len(sched) > 0:
+
+    sched['Away Team'] = sched['Away Team'].replace("Arizona D'Backs", "Arizona Diamondbacks")
+    sched['Home Team'] = sched['Home Team'].replace("Arizona D'Backs", "Arizona Diamondbacks")
+
+    ##############
+
+    teams = list(pd.concat([sched['Away Team'], sched['Home Team']]))
+
+    ##############
+
+    # --- point this to your file ---
+    xml_path = Path("data/epg.xml")
+
+    # Parse with lxml recovery; wrap to guarantee a single root
+    parser = etree.XMLParser(recover=True, encoding="utf-8")
+    tree = etree.fromstring(b"<root>" + xml_path.read_bytes() + b"</root>", parser)
+
+    # Regex for filtering (any team)
+    pattern = re.compile("|".join(map(re.escape, teams)), re.IGNORECASE)
+
+    # Helper: find which team(s) appear in title, return a list (no duplicates, canonical casing)
+    def find_teams_list(title: str, teams_list):
+        found = []
+        for t in teams_list:
+            if re.search(re.escape(t), title, re.IGNORECASE):
+                found.append(t)
+        # Ensure max 2 and stable order as in teams_list
+        return found[:2]
+
+    # Collect rows
+    rows = []
+    for prog in tree.findall(".//programme"):
+        title_el = prog.find("title")
+        title = (title_el.text or "").strip() if title_el is not None else ""
+        if not pattern.search(title):
+            continue
+
+        teams_found_list = find_teams_list(title, teams)
+        if not teams_found_list:  # safety
+            continue
+
+        start_raw = prog.get("start") or ""  # e.g. "20250910221000 +0000"
+        channel = prog.get("channel")
+
+        # Parse start as UTC (first 14 chars = YYYYMMDDHHMMSS)
+        try:
+            dt_utc = datetime.strptime(start_raw[:14], "%Y%m%d%H%M%S").replace(tzinfo=timezone.utc)
+        except Exception:
+            continue
+
+        rows.append({
+            "title": title,
+            "start": start_raw,
+            "channel": channel,
+            "start_dt_utc": dt_utc,
+            # temporarily store full list; we'll split later
+            "teams_found_list": teams_found_list
+        })
+
+    tv = pd.DataFrame(rows)
+
+    if tv.empty:
+        print(tv)
+    else:
+        # Convert to local time (America/New_York)
+        local_tz = ZoneInfo("America/New_York")
+        tv["start_dt_local"] = tv["start_dt_utc"].dt.tz_convert(local_tz)
+
+        # Compute today's local window
+        today_local = datetime.now(local_tz).date()
+        start_of_day = datetime.combine(today_local, time.min, tzinfo=local_tz)
+        end_of_day = start_of_day + timedelta(days=1)
+
+        # Filter to events whose local start falls on "today"
+        mask = (tv["start_dt_local"] >= start_of_day) & (tv["start_dt_local"] < end_of_day)
+        df_today = tv.loc[mask, ["title", "start", "channel", "teams_found_list"]].reset_index(drop=True)
+
+        # ---- split start into local_date/local_time columns ----
+        def split_start_to_local_date_time(start_str: str, tz_str: str = "America/New_York"):
+            if not start_str:
+                return None, None
+            dt_utc = datetime.strptime(start_str[:14], "%Y%m%d%H%M%S").replace(tzinfo=timezone.utc)
+            local_dt = dt_utc.astimezone(ZoneInfo(tz_str))
+            return local_dt.date(), local_dt.time()
+
+        df_today[["local_date", "local_time"]] = df_today["start"].apply(
+            lambda s: pd.Series(split_start_to_local_date_time(s))
+        )
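+        # A programme title may mention only one of the two teams (EPG titles
+        # sometimes abbreviate names), so the match list is padded to two
+        # columns below and the away/home sides are merged separately.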
+        # ---- create team_found_1 and team_found_2 columns ----
+        def to_two_cols(lst):
+            # pad to length 2 with None
+            padded = (lst + [None, None])[:2]
+            return pd.Series(padded, index=["team_found_1", "team_found_2"])
+
+        df_today[["team_found_1", "team_found_2"]] = df_today["teams_found_list"].apply(to_two_cols)
+        df_today = df_today.drop(columns=["teams_found_list"])
+
+        # reorder columns if you like
+        df_today = df_today[["title", "channel", "start", "local_date", "local_time", "team_found_1", "team_found_2"]]
+
+        # Optional: df_today.to_csv("filtered_programmes_today.csv", index=False)
+
+        ########
+        # Pass 1: match EPG entries on the away team
+        df = pd.merge(
+            sched,
+            df_today[[
+                'channel',
+                'local_time',
+                'team_found_1'
+            ]].drop_duplicates().rename({
+                'team_found_1': 'Away Team'
+            }, axis='columns'),
+            how='left',
+            on='Away Team'
+        )
+
+        #########
+        df = df.copy()
+
+        # 1) Build the "game start" local datetime from Game Date + Time (e.g., "2025-09-10" + "2:35 pm")
+        df["Game Date"] = pd.to_datetime(df["Game Date"], errors="coerce")
+        df["Time"] = df["Time"].astype(str).str.strip()
+
+        game_dt = pd.to_datetime(
+            df["Game Date"].dt.strftime("%Y-%m-%d") + " " + df["Time"],
+            format="%Y-%m-%d %I:%M %p",
+            errors="coerce"
+        )
+
+        # 2) Parse EPG local times (strings like "14:30:00") and combine with Game Date
+        def combine_date_and_hms(date_series, hms_series):
+            # parse HH:MM:SS (coerce invalid to NaT)
+            parsed = pd.to_datetime(hms_series.astype(str).str.strip(), format="%H:%M:%S", errors="coerce")
+            # keep only time-of-day; combine with date
+            return pd.to_datetime(date_series.dt.strftime("%Y-%m-%d") + " " + parsed.dt.strftime("%H:%M:%S"), errors="coerce")
+
+        away_dt = combine_date_and_hms(df["Game Date"], df["local_time"])
+
+        # 3) Keep EPG entries whose start falls within -1h/+4h of the scheduled start
+        mask = (away_dt >= game_dt - timedelta(hours=1)) & (away_dt <= game_dt + timedelta(hours=4))
+        filtered = df.loc[mask].copy()
+
+        # Pass 2: match EPG entries on the home team
+        df = pd.merge(
+            sched,
+            df_today[[
+                'channel',
+                'local_time',
+                'team_found_2'
+            ]].drop_duplicates().rename({
+                'team_found_2': 'Home Team'
+            }, axis='columns'),
+            how='left',
+            on='Home Team'
+        )
+
+        ######
+        df = df.copy()
+
+        # Rebuild the scheduled start for this pass (row count may differ after the merge)
+        df["Game Date"] = pd.to_datetime(df["Game Date"], errors="coerce")
+        df["Time"] = df["Time"].astype(str).str.strip()
+
+        game_dt = pd.to_datetime(
+            df["Game Date"].dt.strftime("%Y-%m-%d") + " " + df["Time"],
+            format="%Y-%m-%d %I:%M %p",
+            errors="coerce"
+        )
+
+        # Reuse combine_date_and_hms from pass 1
+        home_dt = combine_date_and_hms(df["Game Date"], df["local_time"])
+
+        # Same -1h/+4h window on the home side
+        mask = (home_dt >= game_dt - timedelta(hours=1)) & (home_dt <= game_dt + timedelta(hours=4))
+        filtered2 = df.loc[mask].copy()
+        ######
+
+        # Combine both passes; away-side channels get the _Away suffix,
+        # home-side channels the _Home suffix
+        df = pd.merge(
+            filtered,
+            filtered2,
+            how='left',
+            on=['Time', 'Away Team', 'Home Team', 'Preview Link', 'Game Date'],
+            suffixes=['_Away', '_Home']
+        )
+
+        #####
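+        # Window rationale (assumption): EPG listings can include pregame
+        # coverage or later re-airs, so only starts within -1h/+4h of the
+        # scheduled first pitch were kept above.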
+        for game in df['Preview Link'].unique():
+            temp = df[df['Preview Link'] == game].drop_duplicates().reset_index(drop=True)
+
+            temp2 = pd.DataFrame({
+                'Channel': list(temp['channel_Away'].dropna()) + list(temp['channel_Home'].dropna())
+            })
+            temp2 = temp2.drop_duplicates().reset_index(drop=True)
+
+            # Split once on "." into Channel + Country (see note in src/epl.py)
+            split_cols = temp2['Channel'].str.split('.', n=1, expand=True)
+            temp2['Channel'] = split_cols[0]
+            temp2['Country'] = split_cols[1].str.upper().fillna("")
+            temp2 = temp2[['Country', 'Channel']]
+
+            # Regex pattern: 'FOX' + 4 uppercase letters (total length 7, all caps)
+            pattern = re.compile(r"^FOX[A-Z]{4}$")
+
+            # Normalize country codes and collapse regional FOX feeds to plain 'FOX'
+            temp2['Country'] = temp2['Country'].replace('US2', 'US')
+            temp2["Channel"] = temp2["Channel"].apply(
+                lambda x: "FOX" if isinstance(x, str) and pattern.match(x) else x
+            )
+            temp2 = temp2.drop_duplicates()
+
+            game_table = (
+                GT(temp2.drop_duplicates())
+                .tab_header(
+                    title=f"{temp['Away Team'][0]} @ {temp['Home Team'][0]}",
+                    subtitle=f"{temp['Time'][0]} {str(temp['Game Date'][0]).split(' ')[0]}",
+                ).tab_source_note(md(' '))
+            )
+            game_table_image_path = 'data/mlb.png'
+            game_table.save(game_table_image_path, window_size=(1000, 1000), web_driver=driver)
+
+            with open(game_table_image_path, 'rb') as image_file:
+                # Prepare the payload with the file
+                payload = {
+                    'file': image_file
+                }
+
+                # Send the POST request to the webhook
+                response = requests.post(mlb_sched_url, files=payload)
+
+            os.remove(game_table_image_path)
diff --git a/src/ufc.py b/src/ufc.py
new file mode 100644
index 0000000..2b0d129
--- /dev/null
+++ b/src/ufc.py
@@ -0,0 +1,294 @@
+import os
+import re
+import requests
+import cloudscraper
+import pandas as pd
+from bs4 import BeautifulSoup, Tag
+from lxml import etree
+from pathlib import Path
+from datetime import datetime, date, time, timedelta, timezone
+from zoneinfo import ZoneInfo  # Python 3.9+
+from great_tables import GT, md
+
+ufc_sched_url = os.getenv("UFC_SCHED_URL")
+driver = 'chrome'
+
+#########
+
+# --- load the HTML ---
+# Request headers (kept for reference; cloudscraper sets its own browser headers)
+headers = {
+    'Access-Control-Allow-Origin': '*',
+    'Access-Control-Allow-Methods': 'GET',
+    'Access-Control-Allow-Headers': 'Content-Type',
+    'Access-Control-Max-Age': '3600',
+    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
+}
+
+url = 'https://www.mmafighting.com/schedule/ufc'
+# Fetch the webpage content
+scraper = cloudscraper.create_scraper()
+response = scraper.get(url)
+
+# Parse the HTML content using BeautifulSoup
+soup = BeautifulSoup(response.content, 'html.parser')
+
+events, dates, card_types, fights = [], [], [], []
+main_starts, prelim_starts = [], []
+
+def get_card_type(li: Tag) -> str:
+    """Find the nearest split section header (Main Card / Undercard)."""
+    split = li.find_parent("div", class_="m-mmaf-pte-event-list__split-item")
+    if split:
+        h4 = split.find("h4")
+        if h4:
+            return h4.get_text(strip=True)
+    return "Main Card"  # default for top-level
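+
+# Note (assumption): the class names used for parsing (e.g.
+# "m-mmaf-pte-event-list__split-item") reflect mmafighting.com's current
+# markup and may break silently if the site's layout changes.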