fren sent the most basic equity swap scraper i've seen.
You need to make sure the modules imported at the top of the script are installed (pandas and requests are third-party; the rest ship with Python).
I can't help you with this beyond the gift itself. Not my role here.
MUST DO:
1 change the dates in the script so that start_date is TODAY - 2 years and end_date is TODAY (the script walks forward one day at a time from start_date to end_date). swap reports go back 2 years.
2 SET THE DAMN DOWNLOAD PATH TOO.
3 ???
4 profit.
I give and take information freely, with no regard to or responsibility for anyone else's actions. Although scraping is legal in the USA, please consult your local laws before running this.
#AllYourBaseAreBelongToUs.
thank you and
#wednesdaytheplanet
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
import pandas as pd
import glob
import requests
import os
from zipfile import ZipFile
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
# Folder where the filtered per-day reports are saved (and later globbed for merging).
# Change this to a directory on your own machine before running.
output_path = r"D:\Python\Swap"
# Function to generate URLs based on a date range
def generate_urls(start_date, end_date, base_url=None):
    """Build one report URL per calendar day from start_date to end_date.

    Args:
        start_date: First day of the range (inclusive), a ``datetime``/``date``.
        end_date: Last day of the range (inclusive). When it is earlier than
            ``start_date`` the result is an empty list.
        base_url: Prefix placed in front of ``YYYY_MM_DD.zip``. When omitted,
            the reconstructed default below is used.

    Returns:
        A list of URL strings in ascending date order.
    """
    if base_url is None:
        # NOTE(review): the URL in the pasted original was cut off
        # ("pddata.dtcc.com/ppd/api/repo…"). The prefix below is a
        # reconstruction of the cumulative equities report endpoint --
        # confirm it against the DTCC site before relying on it.
        base_url = (
            "https://pddata.dtcc.com/ppd/api/report/cumulative/sec/"
            "SEC_CUMULATIVE_EQUITIES_"
        )

    one_day = timedelta(days=1)
    url_list = []
    current_date = start_date
    while current_date <= end_date:
        # The date is embedded in the file name as YYYY_MM_DD
        date_str = current_date.strftime('%Y_%m_%d')
        url_list.append(f"{base_url}{date_str}.zip")
        # Advance by one day (the paste had lost the "+" here, which
        # replaced the date with a timedelta instead of incrementing it)
        current_date += one_day
    return url_list
# Date window of daily reports to fetch; both ends are included
start_date, end_date = (
    datetime(year=2024, month=10, day=2),
    datetime(year=2024, month=10, day=28),
)
# One download URL for each day in that window
urls = generate_urls(start_date, end_date)
# Download and process a file
def download_and_process(url):
    """Download one zipped report, keep only the GME rows, and save them as CSV.

    The archive is fetched into memory, its first member is read as a CSV,
    rows whose underlier column matches one of the GME identifiers are kept,
    and the result is written to ``output_path`` under the member's file name.
    Any failure is reported with ``print`` and the URL is skipped, so one bad
    day does not abort the rest of the run.

    Args:
        url: Address of the ``.zip`` report to download.

    Returns:
        None. The filtered CSV on disk is the only result.
    """
    import io  # local import so the file's import block does not need changing

    # Identifiers to keep. The dots are escaped so that "." is matched
    # literally; unescaped, "GME.N" would also match e.g. "GMEXN".
    pattern = r'GME\.N|GME\.AX|US36467W1099|36467W109'

    try:
        # Download file; fail early on HTTP errors instead of trying to unzip
        # an error page, and do not hang forever on a stalled connection.
        req = requests.get(url, timeout=60)
        req.raise_for_status()

        # Read the first archive member straight from memory, so no temporary
        # zip/CSV files are left in the working directory if a later step fails.
        with ZipFile(io.BytesIO(req.content)) as zip_ref:
            csv_filename = zip_ref.namelist()[0]
            with zip_ref.open(csv_filename) as csv_file:
                df = pd.read_csv(csv_file, low_memory=False)

        # The column holding the underlier differs between report layouts:
        # layouts with 'Primary Asset Class' or 'Action Type' are filtered on
        # "Underlying Asset ID", anything else on "Underlier ID-Leg 1".
        if 'Primary Asset Class' in df.columns or 'Action Type' in df.columns:
            id_column = "Underlying Asset ID"
        else:
            id_column = "Underlier ID-Leg 1"
        df = df[df[id_column].str.contains(pattern, na=False)]

        # Disabled in the original (it was wrapped in a string literal):
        # df['Action type'] = df['Action type'].fillna(False).replace({
        #     'CORRECT': 'CORR',
        #     'CANCEL': 'TERM',
        #     'NEW': 'NEWT'
        # })

        # Save the filtered dataframe as a CSV. basename() keeps the output
        # inside output_path even if the archive member carries a directory.
        os.makedirs(output_path, exist_ok=True)
        output_filename = os.path.join(output_path, os.path.basename(csv_filename))
        df.to_csv(output_filename, index=False)
        print(output_filename)
    except Exception as e:
        # Deliberate best-effort: report the problem and move on to the next URL.
        print(f"An error occurred for {url}: {e}")
# Fan the downloads out over a small thread pool; leaving the `with` block
# waits until every submitted task has finished.
with ThreadPoolExecutor(max_workers=2) as pool:  # tune max_workers to what your system can handle
    for report_url in urls:
        pool.submit(download_and_process, report_url)
# Function to merge all CSV files into one master dataframe
def filter_merge(folder=None):
    """Merge every per-day CSV found in a folder into one DataFrame.

    Args:
        folder: Directory to scan for ``*.csv`` files. Defaults to the
            module-level ``output_path``.

    Returns:
        A DataFrame holding the rows of all readable files, with every column
        read as ``str``. An empty DataFrame is returned when nothing was read.
        Files that raise ``ValueError`` or ``MemoryError`` while being read
        are reported with ``print`` and skipped.
    """
    if folder is None:
        folder = output_path

    # The merged result is saved into this same folder by the script; skip it
    # so that re-running does not merge the previous output back in and
    # duplicate every row.
    merged_name = "filtered_with_price.csv"
    files = sorted(
        path
        for path in glob.glob(os.path.join(folder, '*.csv'))
        if os.path.basename(path) != merged_name
    )

    pieces = []  # collected chunks; concatenated once at the end
    for file in files:
        try:
            # Use chunksize to read the CSV in smaller chunks
            with pd.read_csv(file, chunksize=100000, low_memory=False, dtype=str) as chunks:
                print(str(file))
                for chunk in chunks:
                    # Disabled in the original: numeric conversion and
                    # total-price calculation.
                    # chunk['Notional amount-Leg 1'] = pd.to_numeric(chunk['Notional amount-Leg 1'], errors='coerce')
                    # chunk['Notional amount-Leg 2'] = pd.to_numeric(chunk['Notional amount-Leg 2'], errors='coerce')
                    # chunk['Price'] = pd.to_numeric(chunk['Price'], errors='coerce')
                    # chunk['Total price-Leg 1'] = chunk['Notional amount-Leg 1'] * chunk['Price']
                    # chunk['Total price-Leg 2'] = chunk['Notional amount-Leg 2'] * chunk['Price']
                    # chunk['Total price'] = chunk['Total price-Leg 1'] + chunk['Total price-Leg 2']
                    pieces.append(chunk)
        except ValueError as ve:
            print(f"Error reading file {file}: {ve}")
        except MemoryError:
            print(f"Skipping file {file} due to memory allocation error.")

    if not pieces:
        # pd.concat raises on an empty list; keep the original's empty result
        return pd.DataFrame()
    # A single concat avoids re-copying the growing frame for every chunk
    return pd.concat(pieces, ignore_index=True)
# Merge all per-day filtered reports into one dataframe
master = filter_merge()
# Drop the stray index column if a CSV carried one; a no-op otherwise
master = master.drop(columns=['Unnamed: 0'], errors='ignore')
# Save the final merged dataframe to a new CSV. The location is derived from
# output_path so the folder only has to be configured in one place.
master.to_csv(os.path.join(output_path, "filtered_with_price.csv"), index=False)
print("Merging and calculation completed.")