# Created in June 2024
# PDF: paulgalea.com/Projects/Sprinting_Swimming_Jumping_Record_Progression/Visualisation.pdf
#
# /* ..................... */
# /* ........TOOLS........ */
# /* ..................... */
#
# Python
# Adobe Illustrator
#
# /* ..................... */
# /* ....PYTHON SCRIPT.... */
# /* ..................... */

import logging
import re
import time
from datetime import datetime, timedelta
from io import StringIO

import pandas as pd

# NOTE: requests, BeautifulSoup and selenium are imported inside the two
# scraping helpers that use them, so the pure data helpers in this file
# (time_to_seconds, best) can be imported without the scraping stack.

logger = logging.getLogger(__name__)

# Seconds to wait for a single HTTP response before giving up.
REQUEST_TIMEOUT_SECONDS = 30
# Seconds Selenium waits for an element to become clickable.
WAIT_TIMEOUT_SECONDS = 20
# Pause after a click / scroll so newly requested rows can render.
SETTLE_SECONDS = 5
# Events ranked from the largest mark down; every other event is ranked
# from the smallest mark up.
HIGHER_IS_BETTER_EVENTS = ('jumps/long-jump', 'jumps/high-jump')

# Source (Athletics)
athletics_columns = ['Event','Mark','Competitor','Date']
athletics_urls = [
    'https://worldathletics.org/records/all-time-toplists/sprints/100-metres/all/women/senior',
    'https://worldathletics.org/records/all-time-toplists/sprints/100-metres/all/men/senior',
    'https://worldathletics.org/records/all-time-toplists/sprints/200-metres/all/women/senior',
    'https://worldathletics.org/records/all-time-toplists/sprints/200-metres/all/men/senior',
    'https://worldathletics.org/records/all-time-toplists/jumps/long-jump/all/women/senior',
    'https://worldathletics.org/records/all-time-toplists/jumps/long-jump/all/men/senior',
    'https://worldathletics.org/records/all-time-toplists/jumps/high-jump/all/women/senior',
    'https://worldathletics.org/records/all-time-toplists/jumps/high-jump/all/men/senior',
]

# Source (Aquatics)
aquatics_sex, aquatics_event = ['M','F'], [100,200]
aquatics_year = list(range(1900,datetime.now().year+1))
# The last two query parameters were previously stored as "×Mode" and
# "®ionId" (the HTML entities &times / &reg decoded inside the URL); they are
# restored here to "&timesMode" and "&regionId".
aquatics_base_url = ("https://www.worldaquatics.com/swimming/rankings?gender={sex}&distance={distance}"
                     "&stroke=FREESTYLE&poolConfiguration=LCM&year={year}&timesMode=ALL_TIMES&regionId=all")
aquatics_urls = [aquatics_base_url.format(sex=s, distance=e, year=y)
                 for s in aquatics_sex for e in aquatics_event for y in aquatics_year]


def _scrape_athletics(urls, columns):
    """Download every page of each athletics top-list and return its rows.

    Each returned row is ``[event, <cells...>]`` where ``event`` is path
    segments 5-6 of the URL (e.g. ``'sprints/100-metres'``) and the cells
    are the table columns whose header text appears in ``columns``.
    """
    import requests as r
    from bs4 import BeautifulSoup

    last_day = (datetime.now()-timedelta(days=2)).strftime("%Y-%m-%d")
    rows = []
    for url in urls:
        event = '/'.join(url.split('/')[5:7])
        page = 1
        while True:
            params = {'timing':'electronic', 'windReading':'regular', 'firstDay':'1900-01-01',
                      'page':page, 'lastDay':last_day, 'regionType':'world',
                      'maxResultsByCountry':'all', 'bestResultsOnly':'false'}
            # A timeout keeps one stalled connection from hanging the run.
            response = r.get(url, params=params, timeout=REQUEST_TIMEOUT_SECONDS)
            soup = BeautifulSoup(response.text, 'html.parser')
            table = soup.find('table', {'class':'records-table'})
            if table is None:
                break
            headers = [header.text.strip() for header in table.find_all('th')]
            indices = [headers.index(col) for col in columns if col in headers]
            found_rows = False
            for row in table.find_all('tr'):
                cells = row.find_all('td')
                if cells:
                    found_rows = True
                    rows.append([event] + [cells[index].text.strip() for index in indices])
            if not found_rows:
                # A table with no data rows also marks the end of the list;
                # without this check the page counter would grow forever.
                break
            page += 1
    return rows


def _scrape_aquatics(urls):
    """Load each aquatics rankings page in Chrome and return its rows.

    Returns ``(rows, columns)``: ``rows`` holds the cell texts of every
    rankings row followed by the distance taken from the URL, and
    ``columns`` holds the header labels of the first rankings table read.
    """
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    rows, columns = [], []
    driver = webdriver.Chrome()
    try:
        for url in urls:
            distance = url.split('&distance=')[1].split('&stroke=')[0]
            driver.get(url)
            wait = WebDriverWait(driver, WAIT_TIMEOUT_SECONDS)

            # Dismiss the cookie notice when one shows up; best-effort.
            try:
                cookie_notice = wait.until(EC.element_to_be_clickable((By.ID, 'js-cookie-notice')))
                cookie_notice.find_element(By.TAG_NAME, 'button').click()
            except Exception:
                pass

            # Click "load more" until the button can no longer be clicked;
            # the failed wait is the loop's exit signal.
            while True:
                try:
                    button = wait.until(EC.element_to_be_clickable(
                        (By.CLASS_NAME,'button.load-more-button.js-show-more-button')))
                    driver.execute_script("arguments[0].scrollIntoView(true);", button)
                    button.click()
                    time.sleep(SETTLE_SECONDS)
                except Exception:
                    break

            # Scroll to the bottom until the number of rows stops growing.
            previous_row_count = 0
            while True:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(SETTLE_SECONDS)
                current_row_count = len(driver.find_elements(By.CLASS_NAME, 'rankings-table__row'))
                if current_row_count == previous_row_count:
                    break
                previous_row_count = current_row_count

            try:
                table = driver.find_element(By.CLASS_NAME, 'rankings-table__table')
                if not columns:
                    columns = pd.read_html(StringIO(table.get_attribute('outerHTML')))[0].columns.tolist()
                for row in driver.find_elements(By.CLASS_NAME, 'rankings-table__row'):
                    cells = [cell.text for cell in row.find_elements(By.TAG_NAME, 'td')]
                    if not cells:
                        # A row without <td> cells carries no ranking and
                        # would leave a ragged row in the data.
                        continue
                    cells.append(distance)
                    rows.append(cells)
            except Exception as exc:
                # Still best-effort (the page is skipped), but no longer silent.
                logger.warning("No rankings rows read from %s (%s)", url, type(exc).__name__)
    finally:
        # Always close the browser, even when a page load raises.
        driver.quit()
    return rows, columns


def _build_athletics_frame(rows):
    """Turn scraped athletics rows into an Event/Mark/Competitor/Date frame."""
    frame = pd.DataFrame(rows, columns=athletics_columns)
    frame['Date'] = pd.to_datetime(frame['Date'], format='%d %b %Y')
    frame['Mark'] = pd.to_numeric(frame['Mark'])
    return frame[['Event','Mark','Competitor','Date']]


def time_to_seconds(time):
    """Convert a swim time such as ``'46.94'`` or ``'1:46.67'`` to seconds.

    Every character other than digits, ``:`` and ``.`` is discarded first.
    Colon-separated fields are read as base-60 digits, so ``'m:ss.xx'`` and
    ``'h:mm:ss.xx'`` are both accepted.  Raises ``ValueError`` when a field
    is empty or not a number.

    The parameter keeps its original name and therefore hides the ``time``
    module inside this function, which does not use that module.
    """
    cleaned = re.sub(r'[^\d:.]', '', time)
    seconds = 0.0
    for field in cleaned.split(':'):
        seconds = seconds * 60 + float(field)
    return seconds


def _build_aquatics_frame(data, columns):
    """Turn scraped aquatics rows into an Event/Mark/Competitor/Date frame."""
    frame = pd.DataFrame(data, columns=columns+['Event'])
    frame['Competitor'] = frame['Name'].str.replace('\n', ' ', regex=False)
    frame['Event'] = frame['Event'].apply(lambda x: f"swimming/{x}-metres")
    # Missing 1999 200m record added (see SOURCES at the end of the file).
    frame = pd.concat([frame, pd.DataFrame({'Event':['swimming/200-metres'], 'Time':['1:46.67'],
                                            'Competitor':['Grant HACKETT'], 'Date':['23 Mar 1999']})],
                      ignore_index=True)
    frame['Date'] = pd.to_datetime(frame['Date'], format='%d %b %Y')
    frame['Mark'] = frame['Time'].apply(time_to_seconds)
    # 2009 100m record with unapproved swimsuit removed (see SOURCES).
    frame = frame[~((frame['Event']=='swimming/100-metres') & (frame['Mark']==46.94) &
                    (frame['Competitor']=='Alain BERNARD') & (frame['Date']=='2009-04-23'))]
    return frame[['Event','Mark','Competitor','Date']]


# Records
def best(df, place):
    """Trace how the ``place``-th best mark of each event changed over time.

    For every event and every date present in ``df``, each competitor's
    best mark up to and including that date is taken, competitors are
    ranked by it, and the mark at rank ``place`` is read.  A
    ``(date, mark, event)`` tuple is emitted each time that mark differs
    from the one read on the previous date.  Dates on which fewer than
    ``place`` competitors have a mark are skipped.

    Args:
        df: frame with 'Event', 'Mark', 'Competitor' and 'Date' columns.
        place: 1 for the best mark, 2 for the second best, and so on.

    Returns:
        List of ``(date, mark, event)`` tuples, ordered by event and then
        by date.

    Raises:
        ValueError: if ``place`` is smaller than 1 (it would otherwise
            index from the end and return the worst mark).
    """
    if place < 1:
        raise ValueError("place must be a positive integer")

    changes = []
    for event, group in df.groupby('Event'):
        sort_ascending = event not in HIGHER_IS_BETTER_EVENTS
        # Rank once per event: filtering by date below keeps this order, so
        # the sort does not have to be repeated for every date.
        ranked = group.sort_values(by='Mark', ascending=sort_ascending)
        current_best = None
        # Walk the dates chronologically so the change detection does not
        # depend on the caller having sorted df by date beforehand.
        for date in ranked['Date'].dropna().sort_values().unique():
            eligible = ranked[ranked['Date'] <= date].drop_duplicates(
                subset=['Competitor'], keep='first')
            if len(eligible) < place:
                continue
            mark = eligible['Mark'].iloc[place-1]
            if mark != current_best:
                changes.append((date, mark, event))
                current_best = mark
    return changes


def main():
    """Scrape both sources, build the data frames and write every CSV.

    NOTE(review): "DATA LOCATION" looks like a placeholder; as written,
    all five files go to the same path, so replace each occurrence with
    its own destination.
    """
    # Scrape (Athletics)
    athletics_rows = _scrape_athletics(athletics_urls, athletics_columns)

    # Scrape (Aquatics)
    aquatics_data, aquatics_columns = _scrape_aquatics(aquatics_urls)

    # Data (Athletics)
    athletics_df = _build_athletics_frame(athletics_rows)
    athletics_df.to_csv(r"DATA LOCATION")

    # Data (Aquatics)
    aquatics_df = _build_aquatics_frame(aquatics_data, aquatics_columns)
    aquatics_df.to_csv(r"DATA LOCATION")

    # Records
    df = pd.concat([aquatics_df, athletics_df]).sort_values('Date')
    first = pd.DataFrame(best(df,1), columns=['Date','Mark','Event'])
    first.to_csv(r"DATA LOCATION")
    second = pd.DataFrame(best(df,2), columns=['Date','Mark','Event'])
    second.to_csv(r"DATA LOCATION")
    third = pd.DataFrame(best(df,3), columns=['Date','Mark','Event'])
    third.to_csv(r"DATA LOCATION")


if __name__ == "__main__":
    main()

# /* ..................... */
# /* .......SOURCES....... */
# /* ..................... */
#
# Viewed online June 2024:
# - ncbi.nlm.nih.gov/pmc/articles/PMC4928019
# - enhanced.org/science-is-real
# - worldathletics.org/records/all-time-toplists
# - worldaquatics.com/swimming/rankings
#   > 2009 100m record with unapproved swimsuit removed: telegraph.co.uk/sport/olympics/swimming/5604748/Alain-Bernard-stripped-of-100m-freestyle-world-record-because-of-swimsuit.html
#   > Missing 1999 200m record added: archive.org/details/swimnewsn252/page/5/mode/1up?view=theater