#kyc #pe_utils #data #_2022
Department of Justice (DOJ) press releases are a great source of data for KYC and due-diligence work. The script below scrapes every release listed on justice.gov/news into a local SQLite database and copies the 2017–2022 rows into an FTS5 table for full-text search.
Link:
Script:
```py
# migrate_doj_press_release_table.py
import os
import re
import sqlite3
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
# ------------------------------------ #
# Make the db
# ------------------------------------ #
def make_doj_db():
    """Delete any existing database and create a fresh, empty one."""
    if os.path.exists('db_doj.db'):
        os.remove('db_doj.db')
    db = sqlite3.connect('db_doj.db')
    db.close()
# ------------------------------------ #
# Get the max page number
# ------------------------------------ #
def get_num_of_pages():
    """Read the 'Go to last page' pager link on the first listing
    page and return the final page number."""
    page_list = []
    url = 'https://www.justice.gov/news'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # the last-page link's href looks like /news?page=NNN
    for link in soup.find_all('a', attrs={'title': re.compile('^Go to last page')}):
        page_list.append(link.get('href'))
    last_page_url = page_list[0]
    page_num = last_page_url.split('=')[1]
    return page_num
# ------------------------------------ #
# Get the list of all doj pages
# ------------------------------------ #
def update_doj_page_list():
    """Walk every listing page and collect the press-release URLs."""
    final_page = get_num_of_pages()
    doj_page_list = []
    cur_page = 0
    while cur_page <= int(final_page):
        url = f'https://www.justice.gov/news?page={cur_page}'
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # press-release links are relative paths under /opa/
        for link in soup.find_all('a', attrs={'href': re.compile('^/opa/')}):
            doj_page_list.append('https://www.justice.gov' + link.get('href'))
        # iterate through pages, with a polite delay between requests
        cur_page += 1
        time.sleep(1)
        print(f'Link Page: {cur_page}')
    # drop the photo-galleries link, which also matches the /opa/ pattern
    final_doj_page_list = [
        page for page in doj_page_list
        if page != 'https://www.justice.gov/opa/galleries'
    ]
    return final_doj_page_list
# ------------------------------------ #
# Scrape all the pages and insert to DB
# ------------------------------------ #
def scrape_doj_releases(list_of_doj_urls):
    """Fetch each press release and append one row per page to the doj table."""
    for url in list_of_doj_urls:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # get the date
        try:
            date = soup.find('span', {'class': 'date-display-single'}).text
        except AttributeError:
            date = 'n/a'
        # get the title
        try:
            title = soup.find(id='node-title').text
        except AttributeError:
            title = 'n/a'
        # get the subtitle
        try:
            subtitle = soup.find(id='node-subtitle').text
        except AttributeError:
            subtitle = 'n/a'
        # get the page text
        try:
            page_text = soup.find('div', {'class': 'field field--name-field-pr-body field--type-text-long field--label-hidden'}).text
        except AttributeError:
            page_text = 'n/a'
        # get the press release number
        try:
            pr_number_tag = soup.find('div', {'class': 'field field--name-field-pr-number field--type-text field--label-above'})
            press_release_number = pr_number_tag.text.split(':')[-1]
        except AttributeError:
            press_release_number = 'n/a'
        # build a one-row dataframe and append it to the db
        doj_dict = {
            'url': [url],
            'date': [date],
            'title': [title],
            'subtitle': [subtitle],
            'page_text': [page_text],
            'press_release_number': [press_release_number],
        }
        doj_df = pd.DataFrame(doj_dict)
        conn = sqlite3.connect('db_doj.db', check_same_thread=False)
        doj_df.to_sql(name='doj', con=conn, if_exists='append', index=False)
        conn.close()
# ------------------------------------ #
# Make the virtual table
# ------------------------------------ #
def make_doj_virtual_table():
    """Copy the 2017-2022 rows into an FTS5 table for full-text
    search, then drop the raw table and reclaim the space."""
    conn = sqlite3.connect('db_doj.db', check_same_thread=False)
    conn.execute('''
        CREATE VIRTUAL TABLE v_doj
        USING FTS5 (
            url,
            date,
            title,
            subtitle,
            page_text,
            press_release_number
        );
    ''')
    conn.execute('''
        INSERT INTO v_doj (
            url, date, title, subtitle, page_text, press_release_number
        )
        SELECT url, date, title, subtitle, page_text, press_release_number
        FROM doj
        WHERE date LIKE '%2022%'
           OR date LIKE '%2021%'
           OR date LIKE '%2020%'
           OR date LIKE '%2019%'
           OR date LIKE '%2018%'
           OR date LIKE '%2017%';
    ''')
    conn.commit()
    conn.execute('DROP TABLE doj;')
    conn.execute('VACUUM;')
    conn.close()
# ------------------------------------ #
# Run all functions
# ------------------------------------ #
def create_doj_db():
    make_doj_db()
    doj_page_list = update_doj_page_list()
    scrape_doj_releases(doj_page_list)
    make_doj_virtual_table()
    print('--------------------------------------------')
    print(' DOJ Press Release Database Build Complete ')
    print('--------------------------------------------')


if __name__ == '__main__':
    create_doj_db()
```
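Once the build finishes, the `v_doj` table can be searched with FTS5's `MATCH` operator. A minimal query sketch, assuming `db_doj.db` was built by the script above; the phrase `"money laundering"` is just an illustrative search term:
```py
# query_doj_press_releases.py
# Full-text search against the v_doj FTS5 table built above.
# Assumes create_doj_db() has already run and db_doj.db exists.
import sqlite3

conn = sqlite3.connect('db_doj.db')
rows = conn.execute(
    '''
    SELECT date, title, url
    FROM v_doj
    WHERE v_doj MATCH ?
    ORDER BY rank
    LIMIT 10;
    ''',
    ('"money laundering"',),  # illustrative phrase query
).fetchall()
conn.close()

for date, title, url in rows:
    print(f'{date} | {title} | {url}')
```
`ORDER BY rank` returns the best BM25 matches first, and the `MATCH` argument accepts any FTS5 query syntax, e.g. quoted phrases, `NEAR`, or column filters like `title:fraud`.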