Scrap NAICS codes and save them to sqlite database
The task is to get all NAICS codes with titles from http://www.naics.com/search/ and save them to sqlite database.
I'll use Beautifu soup for scraping. See also:
Install requirements:
pip install beautifulsoup4
Code:
import os.path import sqlite3 import urllib2 from bs4 import BeautifulSoup TARGET_URL = 'http://www.naics.com/search/' def parse_page(cur, url): f = urllib2.urlopen(url) soup = BeautifulSoup(f.read()) table = soup.find('table') for tr in table.find_all('tr'): tds = tr.find_all('td') if tds: code = tds[0].get_text() if len(code) != 6: continue title = tds[1].get_text() print code, title cur.executescript(u""" INSERT INTO Codes(Code, Title) VALUES({code}, "{title}"); """.format(code=code, title=title)) def main(): # init db db_path = os.path.join( os.path.dirname(os.path.realpath(__file__)), 'naics.sqlite3') con = sqlite3.connect(db_path) with con: cur = con.cursor() cur.executescript(""" DROP TABLE IF EXISTS Codes; CREATE TABLE Codes(Code INT PRIMARY KEY, Title TEXT); """) # scrap f = urllib2.urlopen(TARGET_URL) soup = BeautifulSoup(f.read()) n = 0 table = soup.find( 'h4', text='NAICS CODE DRILL DOWN TABLE').next_sibling.find('table') for tr in table.find_all('tr'): tds = tr.find_all('td') if tds: parse_page(cur=cur, url=tds[0].find('a').get('href')) if __name__ == '__main__': main()
Result:
sqlite3 naics.sqlite3 sqlite> select * from Codes where code like '111%'; 111110|Soybean Farming 111120|Oilseed (except Soybean) Farming ... 111940|Hay Farming 111991|Sugar Beet Farming 111992|Peanut Farming 111998|All Other Miscellaneous Crop Farming sqlite> .q
Licensed under CC BY-SA 3.0