SEC Actions - Benjamin Gaines

# migrate_sec_actions.py
"""Build a full-text-search SQLite database of SEC enforcement actions.

SEC actions refer to enforcement measures and legal proceedings initiated by
the U.S. Securities and Exchange Commission (SEC), the federal agency
responsible for regulating securities markets and protecting investors. These
actions are typically taken against individuals, companies, or entities that
violate U.S. securities laws, such as fraud, insider trading, accounting
irregularities, or failure to comply with reporting requirements.

This is a great source of data for KYC / OSINT, so the scraped records are
loaded into a full-text-search (FTS5) SQLite database for such purposes.

Tags: #data #_2022 #kyc #OSINT
Link: https://www.sec.gov/litigations/sec-action-look-up
"""
import itertools
import os
import re
import sqlite3
import string
import time
from typing import Iterable, List, Optional, Tuple

import pandas as pd

# Look-up page that is queried once per (last-name prefix, page number).
_SEARCH_URL = 'https://www.sec.gov/litigations/sec-action-look-up'

# Page size requested from the site; a page with a different number of
# cards is treated as the last page for that prefix.
_ITEMS_PER_PAGE = 50

# Placeholder stored when a card does not contain a given field.
_MISSING = 'n/a'

# Column order shared by the staging table and the FTS5 table.
_COLUMNS = ['name', 'alt_name', 'age', 'enforcement_actions', 'date_filed']

# Matches the class attribute of each result card on the look-up page.
_CARD_CLASS = re.compile("^card border-divide views-row view-row-count")


def _field_text(card, field_class: str, drop: str = '') -> str:
    """Return the text of a card's ``field-content`` span, or ``'n/a'``.

    ``'n/a'`` is returned when either the wrapping div or the inner span is
    absent, so a partially rendered card cannot raise ``AttributeError``.
    Every occurrence of ``drop`` (if given) is removed from the text.
    """
    wrapper = card.find("div", {"class": field_class})
    if wrapper is None:
        return _MISSING
    span = wrapper.find("span", {"class": "field-content"})
    if span is None:
        return _MISSING
    return span.text.replace(drop, '') if drop else span.text


def _parse_card(card) -> Tuple[str, str, str, str, str]:
    """Extract one row, in ``_COLUMNS`` order, from a result card."""
    heading = card.find("h2", {"class": "field-content card-title"})
    name = heading.text if heading is not None else _MISSING
    return (
        name,
        _field_text(card, "views-field views-field-field-also-known-as-1"),
        _field_text(card, "views-field views-field-field-age-in-document"),
        _field_text(
            card,
            "views-field views-field-field-action-name-in-document",
            drop='\t',
        ),
        _field_text(card, "views-field views-field-field-date-filed", drop='\n'),
    )


def _scrape_into_staging(
    conn: sqlite3.Connection,
    prefixes: List[str],
    delay: float,
    timeout: float,
    user_agent: Optional[str],
) -> None:
    """Fetch every result page for each prefix and append rows to ``sec``."""
    # Imported here rather than at module level so that importing this
    # module does not require the scraping dependencies.
    import requests
    from bs4 import BeautifulSoup

    with requests.Session() as session:
        if user_agent:
            session.headers['User-Agent'] = user_agent

        for prefix in prefixes:
            page = 0
            while True:
                print(f'{prefix} - {page}')
                response = session.get(
                    _SEARCH_URL,
                    params={
                        'aId': '',
                        'last_name': prefix,
                        'first_name': '',
                        'items_per_page': _ITEMS_PER_PAGE,
                        'page': page,
                    },
                    timeout=timeout,
                )
                # An error page contains no cards and would otherwise be
                # mistaken for "no more results", silently dropping data.
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')
                rows = [
                    _parse_card(card)
                    for card in soup.find_all("div", {"class": _CARD_CLASS})
                ]

                frame = pd.DataFrame(rows, columns=_COLUMNS)
                # print check (optional)
                print(frame)
                frame.to_sql(name='sec', con=conn, if_exists='append', index=False)

                # Pause between requests to stay polite to the server.
                if delay > 0:
                    time.sleep(delay)

                # Anything other than a full page means this prefix is done.
                if len(rows) != _ITEMS_PER_PAGE:
                    break
                page += 1


def create_sec_db(
    *,
    db_path: str = 'db_sec.db',
    prefixes: Optional[Iterable[str]] = None,
    delay: float = 2.5,
    timeout: float = 30.0,
    user_agent: Optional[str] = None,
) -> None:
    """Scrape the SEC action look-up and build an FTS5 SQLite database.

    Any existing file at ``db_path`` is deleted first. Scraped rows are
    appended to a staging table ``sec``; afterwards the distinct rows are
    copied into the FTS5 virtual table ``v_sec``, the staging table is
    dropped and the file is vacuumed.

    Args:
        db_path: Location of the SQLite file to (re)create.
        prefixes: Last-name prefixes to query. Defaults to every
            two-letter combination from ``aa`` to ``zz``.
        delay: Seconds to sleep after each page request.
        timeout: Per-request timeout in seconds.
        user_agent: Optional ``User-Agent`` header value.
            NOTE(review): sec.gov may reject clients that do not identify
            themselves -- confirm against the SEC's access policy.

    Raises:
        requests.HTTPError: If the look-up page answers with an error status.
        sqlite3.OperationalError: If the SQLite build lacks FTS5 support.
    """
    # ------------------------------------ #
    # Delete the DB if it exists
    # ------------------------------------ #
    if os.path.exists(db_path):
        os.remove(db_path)

    if prefixes is None:
        prefix_list = [
            ''.join(pair)
            for pair in itertools.product(string.ascii_lowercase, repeat=2)
        ]
    else:
        prefix_list = list(prefixes)

    # A single connection is used for the whole build and always closed.
    conn = sqlite3.connect(db_path)
    try:
        # ------------------------------------ #
        # Make the staging table and fill it
        # ------------------------------------ #
        # Created up front so the FTS build below works even when the
        # scrape yields nothing.
        conn.execute('''
            CREATE TABLE IF NOT EXISTS sec (
                name TEXT,
                alt_name TEXT,
                age TEXT,
                enforcement_actions TEXT,
                date_filed TEXT
            )
            ;
            ''')
        if prefix_list:
            _scrape_into_staging(conn, prefix_list, delay, timeout, user_agent)

        # ------------------------------------ #
        # Make the virtual table
        # ------------------------------------ #
        conn.execute('''
            CREATE VIRTUAL TABLE v_sec
            USING FTS5 (
                name,
                alt_name,
                age,
                enforcement_actions,
                date_filed
            )
            ;
            ''')
        conn.execute('''
            INSERT INTO v_sec (
                name,
                alt_name,
                age,
                enforcement_actions,
                date_filed
            )
            SELECT DISTINCT
                name,
                alt_name,
                age,
                enforcement_actions,
                date_filed
            FROM sec
            ;
            ''')
        # VACUUM cannot run inside a transaction, so commit the INSERT first.
        conn.commit()
        conn.execute('''DROP TABLE sec;''')
        conn.execute('''VACUUM;''')
    finally:
        # ------------------------------------ #
        # Close the database
        # ------------------------------------ #
        conn.close()

    # ------------------------------------ #
    # print statement
    # ------------------------------------ #
    print('--------------------------------------------')
    print('--------------------------------------------')
    print(' SEC Actions Database Build Complete ')
    print('--------------------------------------------')
    print('--------------------------------------------')


# create_sec_db()