The following #Code currently extracts data from the website using the #SourceCode below.
I would like you to adjust the #Code to scan data.csv and identify the latest data, then navigate the #SourceCode and update data.csv with the latest data.
#SourceCode:
#Code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import csv
import time
import os
from bs4 import BeautifulSoup
import pandas as pd
# --- Selenium driver setup ---
chrome_options = Options()
# chrome_options.add_argument("--headless")  # uncomment to run without a visible browser window
# Disable image loading so pages render faster.
chrome_options.add_experimental_option(
    "prefs", {"profile.managed_default_content_settings.images": 2}
)
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

# Open the Saudi Exchange historical-reports page.
driver.get('https://www.saudiexchange.sa/wps/portal/saudiexchange/newsandreports/reports-publications/historical-reports/!ut/p/z1/lY_NCsIwEISfpQ8gO0YTc41iU8HaxjRac5GcJKBVRHx-izfrD3VuC9_szJCnmnwT7vEQbvHchGN777zYcyXAMokCWTWDgNXCVXLI5mPavgIy1wJmpUzBJhx6A_J_-WFLDpOW-WiJNTREPz--SPXI9x3EONk2sOmUSzBYdIEPEzsf3jc8gR8lbbjS5eRcjbgYqCR5AIRk284!/dz/d5/L0lHSkovd0RNQUZrQUVnQSEhLzROVkUvZW4!/')
wait = WebDriverWait(driver, 10)  # explicit-wait helper, up to 10 seconds
# --- Report parameters ---
# Define the start and end dates for the report (YYYY-MM-DD).
start_date = '2023-01-01'
end_date = '2023-05-01'

# Select the market.
# NOTE: a stray `test_test = driver.find_element_by_id('perfSummary_paginate')`
# line was removed here: the find_element_by_* methods were removed in
# Selenium 4 (the call raises AttributeError), and its result was never used.
market_dropdown = Select(wait.until(EC.presence_of_element_located((By.ID, 'marketOrIndices'))))
market_dropdown.select_by_value('MAIN')

# Locate the date-range inputs.
start_date_input = wait.until(EC.presence_of_element_located((By.ID, 'startTimePeriod')))
end_date_input = wait.until(EC.presence_of_element_located((By.ID, 'endTimePeriod')))
# Clear any existing input via JavaScript.
driver.execute_script("arguments[0].value = '';", start_date_input)
driver.execute_script("arguments[0].value = '';", end_date_input)
# Input the dates.
driver.execute_script("arguments[0].value = '{}';".format(start_date), start_date_input)
driver.execute_script("arguments[0].value = '{}';".format(end_date), end_date_input)
# Press Enter to ensure the input is registered by the page.
end_date_input.send_keys(Keys.RETURN)
time.sleep(0.5)  # wait for the page to load
last_page = last_sector = last_entity = None
# If data.csv is not empty, find the last page, sector and entity scraped so
# the run can resume where the previous one stopped.
if os.path.exists('data.csv') and os.stat('data.csv').st_size > 0:
    column_names = ["Page", "Sector", "Entity", "Date", "Open", "High", "Low", "Close", "Volume Traded", "Change", "Change %", "No. of Trades", "Value Traded", "Highest in 52 Weeks", "Lowest in 52 Weeks", "Earnings Per Share (EPS)", "Price to Earnings Ratio (P/E Ratio)", "Unnamed: 17", "Unnamed: 18"]
    # `names=` overrides the file's own header line, so that line is read back
    # as an ordinary data row and the "Page" column gets object (string) dtype.
    df = pd.read_csv('data.csv', names=column_names)
    last_row = df.iloc[-1]
    page_value = str(last_row['Page'])
    # BUG FIX: cast Page to int. It is read back as a string, and the
    # original `last_page + 1` arithmetic downstream raised TypeError when
    # resuming. If the file holds only the header row (Page == "Page"),
    # fall through and treat the run as a fresh start instead.
    if page_value.isdigit():
        last_page = int(page_value)
        last_sector = last_row['Sector']
        last_entity = last_row['Entity']
# --- Main scraping loop ---
# Open data.csv in append mode so previously scraped rows are preserved.
with open('data.csv', 'a', newline='') as f:
    writer = csv.writer(f)

    if last_page is None and last_sector is None and last_entity is None:
        # Fresh start: write the header row exactly once — the three context
        # columns followed by the table's own column headers.
        table = wait.until(EC.presence_of_element_located((By.ID, 'perfSummary')))
        headers = [th.text for th in table.find_elements(By.XPATH, './/th')]
        writer.writerow(["Page", "Sector", "Entity"] + headers)
        time.sleep(0.5)  # wait for the page to load

    def scrape_table():
        # Scrape the current page of the perfSummary table and append one CSV
        # row per table row, prefixed with page/sector/entity context.
        # Hoisted out of the pagination loop: the original redefined this
        # function on every page. It reads sector/entity/page_number at call
        # time, so defining it once is behaviorally equivalent.
        table_html = driver.execute_script('return document.getElementById("perfSummary").innerHTML;')
        soup = BeautifulSoup(table_html, 'lxml')
        data_to_write = []
        for row in soup.find_all('tr'):
            cols = [cell.text.strip() for cell in row('td')]
            if cols:  # header/empty rows have no <td> cells — skip them
                data_to_write.append(
                    [page_number, sector.first_selected_option.text, entity.first_selected_option.text] + cols
                )
        writer.writerows(data_to_write)

    # Flags that skip forward to the resume point read from data.csv.
    sector_started = last_sector is None
    entity_started = last_entity is None

    # Loop through sectors (index 0 is presumably a placeholder option,
    # hence range starting at 1 — matches the original behavior).
    sector_count = len(Select(wait.until(EC.presence_of_element_located((By.ID, 'sectors')))).options)
    for sector_index in range(1, sector_count):
        sector = Select(wait.until(EC.presence_of_element_located((By.ID, 'sectors'))))
        sector.select_by_index(sector_index)
        time.sleep(0.5)  # wait for the page to load
        if sector.first_selected_option.text == last_sector:
            sector_started = True
        if not sector_started:
            continue
        print(f"Current Sector: {sector.first_selected_option.text}")

        # Loop through companies in the selected sector.
        entity_count = len(Select(wait.until(EC.presence_of_element_located((By.ID, 'entity')))).options)
        for entity_index in range(1, entity_count):
            entity = Select(wait.until(EC.presence_of_element_located((By.ID, 'entity'))))
            entity.select_by_index(entity_index)
            time.sleep(0.5)  # wait for the page to load
            if entity.first_selected_option.text == last_entity:
                entity_started = True
            if not entity_started:
                continue
            print(f"Current Entity: {entity.first_selected_option.text}")

            # Resume pagination: skip the pages already saved for this entity.
            # BUG FIX: last_page may be read back from the CSV as a string,
            # so cast to int before arithmetic — the original `last_page + 1`
            # raised TypeError when resuming.
            if last_page is None:
                page_number = 1
                pages_to_skip = 0
            else:
                page_number = int(last_page) + 1
                pages_to_skip = int(last_page)
            for _ in range(pages_to_skip):
                next_button = wait.until(EC.presence_of_element_located((By.ID, 'pageing_next')))
                if 'disabled' in next_button.get_attribute('class'):
                    break
                next_button.click()
                time.sleep(1.5)  # wait for the page to load
                print(f"Current page: {page_number-1}")
            last_page = None  # only the first resumed entity skips pages

            # Scrape every page for this entity until Next is disabled.
            while True:
                start_time = time.time()
                scrape_table()
                time.sleep(0.5)  # wait for the page to load
                next_button = wait.until(EC.presence_of_element_located((By.ID, 'pageing_next')))
                if 'disabled' in next_button.get_attribute('class'):
                    break
                next_button.click()
                time.sleep(2)  # wait for the page to load
                end_time = time.time()
                print(f"Time elapsed for page {page_number}: {end_time - start_time} seconds")
                print(f"Current page: {page_number}, Sector: {sector.first_selected_option.text}, Entity: {entity.first_selected_option.text}")
                page_number += 1

# Close the webdriver.
driver.quit()