Scrape NAICS codes and save them to an SQLite database

The task is to get all NAICS codes with their titles from the NAICS code drill-down table page and save them to an SQLite database.

I'll use Beautiful Soup for scraping. See also: the Beautiful Soup documentation.

Install requirements:

pip install beautifulsoup4


import os.path
import sqlite3
import urllib2

from bs4 import BeautifulSoup


def parse_page(cur, url):
    """Scrape one NAICS listing page and store its six-character codes.

    Downloads ``url``, takes the first ``<table>`` on the page and, for
    every row whose first cell contains a code of exactly six characters,
    prints the code/title pair and inserts it into the ``Codes`` table.

    Args:
        cur: sqlite3 cursor used for the INSERT statements. The ``Codes``
            table must already exist.
        url: Address of the page to download.
    """
    response = urllib2.urlopen(url)
    try:
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed alongside bs4.
        soup = BeautifulSoup(response.read(), 'html.parser')
    finally:
        response.close()

    table = soup.find('table')
    if table is None:
        # A page without a table has no codes to collect.
        return

    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        # Header rows (no <td>) and malformed rows lack a code/title pair.
        if len(tds) < 2:
            continue
        code = tds[0].get_text()
        # Only full six-character codes are stored; shorter entries are
        # skipped.
        if len(code) != 6:
            continue
        title = tds[1].get_text()
        # Single-argument form prints the same text on Python 2 and 3.
        print('%s %s' % (code, title))
        # Placeholders instead of string formatting: titles may contain
        # quotes, which would otherwise break (or inject into) the SQL.
        cur.execute(
            'INSERT INTO Codes(Code, Title) VALUES(?, ?);', (code, title))

def main():
    """Rebuild the ``naics.sqlite3`` database from the NAICS index page.

    Creates (or recreates) the ``Codes`` table in ``naics.sqlite3`` located
    next to this script, downloads ``TARGET_URL``, finds the table that
    follows the 'NAICS CODE DRILL DOWN TABLE' text and calls
    ``parse_page`` for the link in the first cell of each row.

    NOTE(review): ``TARGET_URL`` is not defined in the visible part of this
    file; it must be provided as a module-level constant.
    """
    # init db
    db_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'naics.sqlite3')
    con = sqlite3.connect(db_path)
    try:
        # The connection context manager commits on success and rolls back
        # if an exception escapes the block.
        with con:
            cur = con.cursor()
            # Two statements in one call require executescript().
            cur.executescript("""
                DROP TABLE IF EXISTS Codes;
                CREATE TABLE Codes(Code INT PRIMARY KEY, Title TEXT);
            """)

            # scrape
            response = urllib2.urlopen(TARGET_URL)
            try:
                soup = BeautifulSoup(response.read(), 'html.parser')
            finally:
                response.close()

            table = soup.find(
                text='NAICS CODE DRILL DOWN TABLE').next_sibling.find('table')
            for tr in table.find_all('tr'):
                tds = tr.find_all('td')
                if not tds:
                    continue
                link = tds[0].find('a')
                # Rows whose first cell has no link point to no sub-page.
                if link is None:
                    continue
                parse_page(cur=cur, url=link.get('href'))
    finally:
        # "with con" only manages the transaction; close explicitly.
        con.close()

# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()


sqlite3 naics.sqlite3
sqlite> select * from Codes where code like '111%';
111110|Soybean Farming
111120|Oilseed (except Soybean) Farming
111940|Hay Farming
111991|Sugar Beet Farming
111992|Peanut Farming
111998|All Other Miscellaneous Crop Farming
sqlite> .q
Licensed under CC BY-SA 3.0