#data #kyc The International Consortium of Investigative Journalists (ICIJ) is a global network of journalists who collaborate to investigate and report on issues of public interest, particularly those related to financial secrecy, corruption, tax evasion, and illicit activities. Their data is quite helpful for KYC/OSINT, and much of it is structured for easy ingest into Neo4J. (I just use it for full text search but there is a lot more someone could do with this.) Link: Script: ```py # migrate_icij_data_table.py import sqlite3 import pandas as pd import os def create_icij_db(): # NOTE: the static data for this migration # is being stored separately from the repo # to save space. Before running this file, # you will need to drag and drop the ICIJ # CSV files into the 'static_data' folder. # This data does not update, so this can # be done on an as-needed basis # ------------------------------------ # # Delete the DB if it exists # ------------------------------------ # if os.path.exists("db_icij.db"): os.remove("db_icij.db") # ------------------------------------ # # Make the db # ------------------------------------ # db = sqlite3.connect('db_icij.db') cursor = db.cursor() cursor.close() db.close() # ------------------------------------ # # Make the dataframe # ------------------------------------ # address_df = pd.read_csv('static_data/nodes-addresses.csv') entities_df = pd.read_csv('static_data/nodes-entities.csv') intermediaries_df = pd.read_csv('static_data/nodes-intermediaries.csv') officers_df = pd.read_csv('static_data/nodes-officers.csv') others_df = pd.read_csv('static_data/nodes-others.csv') address_df = address_df.astype(str) entities_df = entities_df.astype(str) intermediaries_df = intermediaries_df.astype(str) officers_df = officers_df.astype(str) others_df = others_df.astype(str) icij_df = pd.concat([ address_df, entities_df, intermediaries_df, officers_df, others_df ]) # ------------------------------------ # # Insert data to db # ------------------------------------ # conn = sqlite3.connect('db_icij.db', check_same_thread=False) icij_df.to_sql(name='icij', con=conn, if_exists='append', index=False) # ------------------------------------ # # Make the virtual table # ------------------------------------ # conn.execute(''' CREATE VIRTUAL TABLE v_icij USING FTS5 (--_id, --node_id, name, original_name, former_name, address, countries, --country_codes, sourceID --valid_until, --note, --jurisdiction, --jurisdiction_description, --company_type, --internal_id, --incorporation_date, --inactivation_date, --struck_off_date, --dorm_date, --status, --service_provider, --ibcRUC, --type, --closed_date ) ; ''' ) conn.execute(''' INSERT INTO v_icij(--_id, --node_id, name, original_name, former_name, address, countries, --country_codes, sourceID --valid_until, --note, --jurisdiction, --jurisdiction_description, --company_type, --internal_id, --incorporation_date, --inactivation_date, --struck_off_date, --dorm_date, --status, --service_provider, --ibcRUC, --type, --closed_date ) SELECT --_id, --node_id, name, original_name, former_name, address, countries, --country_codes, sourceID --valid_until, --note, --jurisdiction, --jurisdiction_description, --company_type, --internal_id, --incorporation_date, --inactivation_date, --struck_off_date, --dorm_date, --status, --service_provider, --ibcRUC, --type, --closed_date FROM icij ; ''' ) conn.commit() conn.execute('''DROP TABLE icij;''') conn.execute('''VACUUM;''') # ------------------------------------ # # Close the database # ------------------------------------ # conn.close() # ------------------------------------ # # print statement # ------------------------------------ # print(' ') print(len(icij_df)) print(' ') print('--------------------------------------------') print('--------------------------------------------') print(' ICIJ Database Build Complete ') print('--------------------------------------------') print('--------------------------------------------') create_icij_db() ```