Robust HTML table parser for tables with rowspan and/or colspan attributes

In scouring the web, it was to the embarrassment of humanity that I could not find a parser that could accurately parse the tables found on Wikipedia, such as the ones on Political_party_strength_in_Wisconsin; I thus wrote my own.

The issue is that existing parsers for Wikipedia tables could not handle rowspan and colspan attributes, so when they hit such tables they either errored out or returned garbled tables and entries. Because the data tables on the Wikipedia article I needed had these attributes, I had to figure out how to parse such tables myself. If you encounter a similar issue, I hope this saves you time.
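To make the failure mode concrete, here is a minimal sketch (on a toy table of my own, not one of the Wikipedia tables) of what a naive row-by-row parse does to a rowspan:

from bs4 import BeautifulSoup as bs

s = "<table><tr><th rowspan='2'>a</th><th>b</th></tr><tr><td>c</td></tr></table>"
naive = [[cell.getText() for cell in row.find_all(['th', 'td'])]
         for row in bs(s, 'lxml').find_all('tr')]
print(naive)  ## [['a', 'b'], ['c']] -- 'c' lands under 'a' instead of 'b'; the rowspan is lost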

Below, I include both the functions and a few examples. If you just want to see the examples, scroll past the Parser functions section.

Parser functions

The functions below parse HTML tables (BeautifulSoup objects). I also include a convenience function for fetching a website's HTML; for details on how everything fits together, see the examples.

## Libraries used:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
import requests

def pre_process_table(table):
    """
    INPUT:
        1. table - a bs4 element that contains the desired table: ie <table> ... </table>
    OUTPUT:
        a tuple of:
            1. rows - a list of table rows ie: list of <tr>...</tr> elements
            2. num_rows - number of rows in the table
            3. num_cols - number of columns in the table
    """
    rows = table.find_all('tr')
    num_rows = len(rows)

    ## get the table's column count. Most often, the raw cell count is accurate...
    num_cols = max(len(x.find_all(['th', 'td'])) for x in rows)

    ## ...but on occasion it fails because some cells carry colspan attributes.
    ## Recount the candidate header rows (those with more than half the raw
    ## column count), expanding each cell by its colspan:
    header_rows_set = [x.find_all(['th', 'td']) for x in rows
                       if len(x.find_all(['th', 'td'])) > num_cols/2]

    num_cols_set = []
    for header_row in header_rows_set:
        num_cols = 0
        for cell in header_row:
            row_span, col_span = get_spans(cell)
            num_cols += col_span
        num_cols_set.append(num_cols)

    num_cols = max(num_cols_set)

    return (rows, num_rows, num_cols)
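As a quick sanity check of the colspan-aware column count (note pre_process_table relies on get_spans, defined next; the toy table is my own):

t = bs('<table><tr><th>a</th><th colspan="3">b</th></tr></table>', 'lxml').table
rows, num_rows, num_cols = pre_process_table(t)
print(num_rows, num_cols)  ## 1 4 -- the colspan="3" cell counts as three columns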


def get_spans(cell):
    """
    INPUT:
        1. cell - a <td>...</td> or <th>...</th> element that contains a table cell entry
    OUTPUT:
        1. a tuple of the cell's row span and column span (each defaults to 1)
    """
    rep_row = int(cell.attrs.get('rowspan', 1))
    rep_col = int(cell.attrs.get('colspan', 1))

    return (rep_row, rep_col)
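For instance, a cell that only carries a rowspan comes back as (rowspan, 1):

cell = bs('<table><tr><td rowspan="2">aaa</td></tr></table>', 'lxml').td
print(get_spans(cell))  ## (2, 1)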
 
def process_rows(rows, num_rows, num_cols):
    """
    INPUT:
        1. rows - a list of table rows ie <tr>...</tr> elements
        2. num_rows - number of rows in the table
        3. num_cols - number of columns in the table
    OUTPUT:
        1. data - a Pandas DataFrame with the html data in it
    """
    ## start from an all-NaN frame; object dtype since we will be storing strings
    data = pd.DataFrame(np.full((num_rows, num_cols), np.nan), dtype=object)
    for i, row in enumerate(rows):
        ## start at the first column of this row that has not already been
        ## filled by a rowspan from the rows above
        empty_cols = data.iloc[i, :][data.iloc[i, :].isnull()].index
        if len(empty_cols) == 0:
            continue  ## the entire row was filled by rowspans from above
        col_stat = empty_cols[0]

        for cell in row.find_all(['td', 'th']):
            rep_row, rep_col = get_spans(cell)

            ## skip past any columns already claimed by rowspans from above
            while any(data.iloc[i, col_stat:col_stat+rep_col].notnull()):
                col_stat += 1

            ## write the cell's text into every position covered by its spans
            data.iloc[i:i+rep_row, col_stat:col_stat+rep_col] = cell.getText()
            if col_stat < data.shape[1]-1:
                col_stat += rep_col

    return data
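A minimal end-to-end check of the fill logic on a two-row table with a rowspan (again a toy example of mine):

t = bs('<table><tr><td rowspan="2">a</td><td>b</td></tr><tr><td>c</td></tr></table>', 'lxml').table
rows, num_rows, num_cols = pre_process_table(t)
print(process_rows(rows, num_rows, num_cols))
##    0  1
## 0  a  b
## 1  a  c   <- 'a' is carried down into row 1 by its rowspan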

def fetch_website(url):
    ## use a friendlier user agent; some sites block the default requests one
    user_agent = {'User-agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.18 Safari/537.36'}
    try:
        r = requests.get(url, headers=user_agent)
        return r.content
    except requests.exceptions.RequestException:
        print("Skipping this url")
        return None
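Since fetch_website returns None when the request fails, guard before handing the result to BeautifulSoup; a minimal sketch:

site = fetch_website("https://en.wikipedia.org/wiki/Political_party_strength_in_Wisconsin")
if site is not None:
    soup = bs(site, 'lxml')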
    
def main(table):
    """
    DESCRIPTION:
        Parse a single table (a bs4 <table> element) and return a Pandas DataFrame
    """
    rows, num_rows, num_cols = pre_process_table(table)
    df = process_rows(rows, num_rows, num_cols)
    return df
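The examples below call pre_process_table and process_rows directly; main simply bundles those two calls, so given any bs4 <table> element the whole parse is just df = main(table).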

Example 1 - the essence of the game

s = """
<table>
    <tr>
        <th rowspan="2">year</th>
        <th>col1</th>
        <th colspan="2">col2</th>
    </tr>
    <tr>
        <td rowspan="2">aaa</td>
        <td rowspan="2">bbb</td>
        <td rowspan="1">ccc</td>
    </tr>
    <tr>
        <td>2017</td>
        <td rowspan="2">ddd</td>
    </tr>
    <tr>
        <td>2018</td>
        <td>col1</td>
        <td colspan="1">col2</td>
    </tr>
</table>
"""
table = bs(s, 'lxml').find('table')
rows, num_rows, num_cols = pre_process_table(table)
df = process_rows(rows, num_rows, num_cols)
df
      0     1     2     3
0  year  col1  col2  col2
1  year   aaa   bbb   ccc
2  2017   aaa   bbb   ddd
3  2018  col1  col2   ddd
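Notice how year, aaa, bbb, and ddd are each propagated into every row their rowspan covers, and how col2 is duplicated across the two columns of its colspan.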

Example 2 - Political Parties in Wisconsin

Here, I parse the table from the Wikipedia article mentioned in the introduction.

## Go to the link and download the page's html code:
url = "https://en.wikipedia.org/wiki/Political_party_strength_in_Wisconsin"
site = fetch_website(url)
soup = bs(site, 'lxml')
## Find tables on the page and locate the desired one:
## Caveat: note that I explicitly search for a wikitable!  
tables = soup.find_all("table", class_='wikitable')

## I want the fourth table (index 3), the one that covers years 2000-2018
table = tables[3]
## run the above functions to extract the data
rows, num_rows, num_cols = pre_process_table(table)
df = process_rows(rows, num_rows, num_cols)

## print the result
df.head()
   0     1                   2                    3                     4                  5                  6                  7                  8                       9                         10                      11
0  Year  Executive offices   Executive offices    Executive offices     Executive offices  Executive offices  State Legislature  State Legislature  United States Congress  United States Congress    United States Congress  Electoral College votes
1  Year  Governor            Lieutenant Governor  Secretary of State    Attorney General   Treasurer          State Senate       State Assembly     U.S. Senator (Class I)  U.S. Senator (Class III)  U.S. House              Electoral College votes
2  2000  Tommy Thompson (R)  Scott McCallum (R)   Doug La Follette (D)  Jim Doyle (D)      Jack Voight (R)    17D, 16R           55R, 44D           Herb Kohl (D)           Russ Feingold (D)         5D, 4R                  Gore/Lieberman (D) N
3  2001  Scott McCallum (R)  Margaret Farrow (R)  Doug La Follette (D)  Jim Doyle (D)      Jack Voight (R)    18D, 15R           56R, 43D           Herb Kohl (D)           Russ Feingold (D)         5D, 4R                  Gore/Lieberman (D) N
4  2002  Scott McCallum (R)  Margaret Farrow (R)  Doug La Follette (D)  Jim Doyle (D)      Jack Voight (R)    18D, 15R           56R, 43D           Herb Kohl (D)           Russ Feingold (D)         5D, 4R                  Gore/Lieberman (D) N

Example 3 - Cricket

Stack Overflow had a Wikipedia parsing question, so I thought I would use the page of interest there.

url = "http://en.wikipedia.org/wiki/List_of_England_Test_cricket_records"
site = fetch_website(url)
soup = bs(site, 'lxml')
tables = soup.find_all("table", class_='wikitable')

## I want the fourth table (index 3): England's lowest innings totals
table = tables[3]

rows, num_rows, num_cols = pre_process_table(table)
df = process_rows(rows, num_rows, num_cols)

## print the result
df.head()
   0     1      2               3                      4                                   5        6
0  Rank  Score  Opposition      Venue                  Place                               Season   Result
1  1     45     v  Australia    Sydney Cricket Ground  Sydney, Australia                   1886/87  Won
2  2     46     v  West Indies  Queen's Park Oval      Port of Spain, Trinidad and Tobago  1994     Lost
3  3     51     v  West Indies  Sabina Park            Kingston, Jamaica                   2009     Lost
4  4     52     v  Australia    The Oval               London, England                     1948     Lost

Example 4 - Political Parties in DC

url = "https://en.wikipedia.org/wiki/Political_party_strength_in_Washington,_D.C."
site = fetch_website(url)
soup = bs(site, 'lxml')
tables = soup.find_all("table", class_='wikitable')

## I want the first table on the page (index 0)
table = tables[0]

rows, num_rows, num_cols = pre_process_table(table)
df = process_rows(rows, num_rows, num_cols)

## print the result
df.head()
   0                1                 2                 3                 4                 5                      6                             7                             8                      9
0  Year             Executive office  Executive office  District Council  District Council  U.S. Congress          U.S. Congress                 U.S. Congress                 U.S. Congress          Presidential electoral college votes
1  Year             Mayor             Attorney General  Chair             Composition       U.S. House Delegate    Shadow U.S. Senator (Seat 1)  Shadow U.S. Senator (Seat 2)  Shadow Representative  Presidential electoral college votes
2  1801\n...\n1870  [1]               [1]               [1]               [1]               [1]                    [1]                           [1]                           [1]                    [2]
3  1871             [3]               [3]               [3]               [3]               Norton P. Chipman (R)  [4]                           [4]                           [4]                    [2]
4  1872             [3]               [3]               [3]               [3]               Norton P. Chipman (R)  [4]                           [4]                           [4]                    [2]