#data #_2022 #OSINT
The OCC Actions List refers to a record maintained by the Office of the Comptroller of the Currency (OCC), which documents formal enforcement actions and administrative proceedings against national banks, federal savings associations, and their employees, officers, or directors.
This is one list I consult when doing KYC checks for individuals affiliated to a deal. Accordingly, I like to prep it for full text search by using SQLite.
Link:
Script:
```py
# get_occ_actions.py
# ------------------------------------ #
# Notes and resources for occ scraping
# ------------------------------------ #
# occ robots.txt
# https://occ.gov/robots.txt
# Only Applebot is disallowed from crawling.
# Always check the robots.txt file for
# updates before runnign this script to
# be sure their terms haven't changed
# ------------------------------------ #
# Imports
# ------------------------------------ #
from bs4 import BeautifulSoup
import requests
from bs4.element import Comment
import sqlite3
import pandas as pd
import requests
import os
import json
import time
import re
def create_occ_db():
# ------------------------------------ #
# Delete the DB if it exists
# ------------------------------------ #
if os.path.exists("db_occ.db"):
os.remove("db_occ.db")
# ------------------------------------ #
# Make the db
# ------------------------------------ #
db = sqlite3.connect('db_occ.db')
cursor = db.cursor()
cursor.close()
db.close()
# ------------------------------------ #
# Make and insert the dataframe
# ------------------------------------ #
# make the list of all character combos for the url
list_0 = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
list_1 = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
char_list = []
for char_0 in list_0:
for char_1 in list_1:
char_set = char_0+char_1
char_list.append(char_set)
# query each character set and continue while pagination is true
for char_set in char_list:
try:
# ------------------------------------ #
# Make the DataFrame
# ------------------------------------ #
# scrape the site
url = f'https://apps.occ.gov/EASearch/?Search={char_set}&Category=&ItemsPerPage=1000&Sort=&AutoCompleteSelection='
df_list = pd.read_html(url)
df = df_list[0]
df.columns = df.columns.droplevel(0)
# rename the column titles to remove spaces and camelcase
column_titles = [
'institution_bank_name',
'charter_number',
'company',
'individual',
'city_and_state',
'type',
'amount',
'date_start',
'start_doc',
'date_close',
'close_doc',
'docket_number'
]
df.columns = column_titles
# print check
print(char_set)
print(' ')
print(df.head())
print(' ')
# ------------------------------------ #
# Insert data to db
# ------------------------------------ #
conn = sqlite3.connect('db_occ.db', check_same_thread=False)
df.to_sql(name='occ', con=conn, if_exists='append', index=False)
except:
print(f'No data for: {char_set}')
continue
# ------------------------------------ #
# Make the virtual table
# ------------------------------------ #
headers = ''
for title in column_titles:
headers += title
headers += ','
conn.execute(f'''
CREATE VIRTUAL TABLE v_occ
USING FTS5 (
{headers[:-1]}
)
;
'''
)
conn.execute(f'''
INSERT INTO v_occ (
{headers[:-1]}
)
SELECT DISTINCT {headers[:-1]}
FROM occ
;
'''
)
conn.commit()
conn.execute('''DROP TABLE occ;''')
conn.execute('''VACUUM;''')
# ------------------------------------ #
# Close the database
# ------------------------------------ #
conn.close()
# ------------------------------------ #
# print statement
# ------------------------------------ #
print(' ')
print(len(df))
print(' ')
print('--------------------------------------------')
print('--------------------------------------------')
print(' OCC Actions Database Build Complete ')
print('--------------------------------------------')
print('--------------------------------------------')
# ------------------------------------ #
# Execute the function
# ------------------------------------ #
create_occ_db()
```