#data #bizdev_utils #kyc #_2023 PE Hub is an online publication focused on the private equity (PE) and venture capital (VC) industries. It serves as a resource for professionals involved in private markets, providing news, analysis, and insights about private equity deals, venture capital funding, mergers and acquisitions (M&A), and other related activities. Like Channel E2E, the site titles of PE Hub can be helpful for KYC and BizDev efforts, as they are a quick lookup of M&A activity (all I care about is if a company has received prior investment or not, so if it's on this list then I'm disinterested).

Link:

Script:
```shell
#!/bin/bash
# Scrape PE Hub's sitemap index, walk each post sitemap, and flatten the
# article slugs into final_pe_links.csv (one comma-terminated title per line)
# for quick KYC / BizDev lookups of prior-investment activity.
#
# NOTE(review): uses GNU `sed -i`; on BSD/macOS this needs `sed -i ''`.

# curl meta sitemap
curl https://www.pehub.com/sitemap_index.xml > a_meta_sitemap.txt

# print statement to create space on the command line
echo " "

# grep just the lines we want (the per-post sitemap URLs)
grep "https://www.pehub.com/post-sitemap" a_meta_sitemap.txt > b_meta_sitemap.txt

# get rid of loc tags with sed
# (fixed: original had a duplicated word — `sed sed -i ...` — which would make
# sed treat the literal word "sed" as its script and fail)
sed -i 's/<\/\?loc>//g' b_meta_sitemap.txt

# get rid of leading/trailing whitespace with sed
sed -i 's/^[[:blank:]]*//; s/[[:blank:]]*$//' b_meta_sitemap.txt

# make a file variable
file="b_meta_sitemap.txt"

# loop through meta sitemap and curl each sitemap within it
while IFS= read -r line
do
    # show which sitemap we are on
    echo "$line"

    # curl the endpoint
    curl "$line" > pe_sitemap.txt

    # return just the useful lines
    grep "https://" pe_sitemap.txt > a_pe_links.txt

    # get rid of leading useless text (the <loc> tag and domain prefix)
    sed -i 's#<loc>https://www.pehub.com/##' a_pe_links.txt

    # replace - with space so slugs read as titles
    sed -i 's/-/ /g' a_pe_links.txt

    # replace useless end text with , and make csv
    sed 's/\/<\/loc>/,/g' a_pe_links.txt > b_pe_links.csv

    # get rid of additional tags
    grep -v '<' b_pe_links.csv > c_pe_links.csv

    # append results to csv
    sed 's/^[[:space:]]*//' c_pe_links.csv >> final_pe_links.csv

    # additional print for space when reading
    echo " "
done < "$file"

# delete the temp files (optional)
# (fixed: original ended with a bare `rm` whose argument had fallen onto the
# next line, so `rm` ran with no operands and pe_sitemap.txt was left behind)
rm a_meta_sitemap.txt a_pe_links.txt b_meta_sitemap.txt b_pe_links.csv c_pe_links.csv pe_sitemap.txt
```