Robust Wikipedia Table Parser
Robust HTML table parser for tables with rowspan and/or colspan attributes
In scouring the web, it was to the embarrassment of humanity that I could not find a parser that could accurately parse the tables found on Wikipedia, such as those in Political_party_strength_in_Wisconsin; I thus wrote my own.
The issue is that existing parsers for Wikipedia tables cannot handle rowspan and colspan attributes, so when they hit such tables they either error out or return garbled tables and entries. Because I needed data from a Wikipedia article whose tables had these attributes, I had to figure out how to parse them myself. If you encounter a similar issue, I hope this saves you time.
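To make the failure mode concrete, here is a minimal sketch (a toy table of my own, not from Wikipedia) of what goes wrong when each row is read in isolation:

from bs4 import BeautifulSoup as bs

html = """<table>
  <tr><th rowspan="2">year</th><th>winner</th></tr>
  <tr><td>alice</td></tr>
</table>"""

## A naive parse reads each <tr> independently, so the cell covered by
## the rowspan is simply missing and 'alice' slides under 'year':
naive = [[c.get_text() for c in tr.find_all(['th', 'td'])]
         for tr in bs(html, 'lxml').find_all('tr')]
print(naive)   # [['year', 'winner'], ['alice']]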
Below, I include both the functions and a few examples. If you just want to see the examples, scroll past the Parser functions section.
Parser functions
The functions below parse HTML tables (BeautifulSoup objects). I also include a convenience function for fetching a website's HTML; for the details, I suggest looking through the examples.
## Libraries used:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
import requests
def pre_process_table(table):
    """
    INPUT:
        1. table - a bs4 element that contains the desired table: ie <table> ... </table>
    OUTPUT:
        a tuple of:
        1. rows - a list of table rows ie: list of <tr>...</tr> elements
        2. num_rows - number of rows in the table
        3. num_cols - number of columns in the table
    """
    rows = [x for x in table.find_all('tr')]
    num_rows = len(rows)
    ## Get the table's column count. Most of the time a per-row cell
    ## count is accurate...
    num_cols = max([len(x.find_all(['th', 'td'])) for x in rows])
    ## ...but it undercounts when every row carries colspan attributes.
    ## Re-count by summing colspans over each sufficiently long row:
    header_rows_set = [x.find_all(['th', 'td']) for x in rows
                       if len(x.find_all(['th', 'td'])) > num_cols / 2]
    num_cols_set = []
    for header_rows in header_rows_set:
        num_cols = 0
        for cell in header_rows:
            row_span, col_span = get_spans(cell)
            num_cols += col_span
        num_cols_set.append(num_cols)
    num_cols = max(num_cols_set)
    return (rows, num_rows, num_cols)
def get_spans(cell):
    """
    INPUT:
        1. cell - a <td>...</td> or <th>...</th> element that contains a table cell entry
    OUTPUT:
        1. a tuple with the cell's row and col spans
    """
    rep_row = int(cell.attrs['rowspan']) if cell.has_attr('rowspan') else 1
    rep_col = int(cell.attrs['colspan']) if cell.has_attr('colspan') else 1
    return (rep_row, rep_col)
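As a quick sanity check of the two functions above (toy fragments of my own, not from any article):

## get_spans simply reads the span attributes, defaulting to 1:
cell = bs('<td rowspan="3" colspan="2">x</td>', 'html.parser').td
print(get_spans(cell))       # (3, 2)

## Every row below carries a colspan, so a plain per-row cell count
## maxes out at 2, while the colspan-aware re-count finds 3 columns:
t = bs('<table><tr><th colspan="2">a</th><th>b</th></tr>'
       '<tr><td>1</td><td colspan="2">2</td></tr></table>', 'lxml').table
rows, num_rows, num_cols = pre_process_table(t)
print(num_rows, num_cols)    # 2 3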
def process_rows(rows, num_rows, num_cols):
    """
    INPUT:
        1. rows - a list of table rows ie <tr>...</tr> elements
    OUTPUT:
        1. data - a Pandas dataframe with the html data in it
    """
    ## Build an all-NaN grid. Using object dtype lets us assign strings
    ## cell-by-cell without dtype upcasting issues in newer pandas.
    data = pd.DataFrame(np.full((num_rows, num_cols), np.nan), dtype=object)
    for i, row in enumerate(rows):
        ## Start at the first column not already claimed by a rowspan
        ## from an earlier row; skip rows that are entirely pre-filled.
        empty_cols = data.iloc[i, :][data.iloc[i, :].isnull()].index
        if len(empty_cols) == 0:
            continue
        col_stat = empty_cols[0]
        for j, cell in enumerate(row.find_all(['td', 'th'])):
            rep_row, rep_col = get_spans(cell)
            ## Advance past any columns a span has already filled
            while any(data.iloc[i, col_stat:col_stat + rep_col].notnull()):
                col_stat += 1
            ## Copy the cell's text into every grid position it spans
            data.iloc[i:i + rep_row, col_stat:col_stat + rep_col] = cell.getText()
            if col_stat < data.shape[1] - 1:
                col_stat += rep_col
    return data
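Applied to the toy table from the introduction, the NaN-grid approach now lands alice in the right column:

t = bs("""<table>
  <tr><th rowspan="2">year</th><th>winner</th></tr>
  <tr><td>alice</td></tr>
</table>""", 'lxml').table
rows, num_rows, num_cols = pre_process_table(t)
print(process_rows(rows, num_rows, num_cols))
##       0       1
## 0  year  winner
## 1  year   alice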
def fetch_website(url):
    ## Use a browser-like user agent; some sites refuse the default one
    user_agent = {'User-agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.18 Safari/537.36'}
    try:
        r = requests.get(url, headers=user_agent)
        return r.content
    except requests.exceptions.ConnectionError:
        print("Skipping this url")
        return None
def main(table):
    """
    DESCRIPTION:
        Parse one table and return a Pandas DataFrame
    """
    rows, num_rows, num_cols = pre_process_table(table)
    df = process_rows(rows, num_rows, num_cols)
    return df
Example 1 - the essence of the game
s = """
<table>
<tr>
<th rowspan="2">year</th>
<th>col1</th>
<th colspan="2">col2</th>
</tr>
<tr>
<td rowspan="2">aaa</td>
<td rowspan="2">bbb</td>
<td rowspan="1">ccc</td>
</tr>
<tr>
<td>2017</td>
<td rowspan="2">ddd</td>
</tr>
<tr>
<td>2018</td>
<td>col1</td>
<td colspan="1">col2</td>
</tr>
</table>
"""
table = bs(s, 'lxml').find('table')
rows, num_rows, num_cols = pre_process_table(table)
df = process_rows(rows, num_rows, num_cols)
df
|   | 0    | 1    | 2    | 3    |
|---|------|------|------|------|
| 0 | year | col1 | col2 | col2 |
| 1 | year | aaa  | bbb  | ccc  |
| 2 | 2017 | aaa  | bbb  | ddd  |
| 3 | 2018 | col1 | col2 | ddd  |
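Note how spans are handled: a spanned cell's text is copied into every grid position it covers, which is why year (rowspan=2) appears in rows 0 and 1 and ddd (rowspan=2) fills the last column of rows 2 and 3. The duplication is deliberate; it is what keeps every row aligned to the same columns.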
Example 2 - Political Parties in Wisconsin:
Here, I parse the HTML table from the article mentioned in the introduction.
## Go to the link and download the page's html code:
url = "https://en.wikipedia.org/wiki/Political_party_strength_in_Wisconsin"
site = fetch_website(url)
soup = bs(site, 'lxml')
## Find tables on the page and locate the desired one:
## Caveat: note that I explicitly search for a wikitable!
tables = soup.findAll("table", class_='wikitable')
## I want table 3, the one that covers the years 2000-2018
table = tables[3]
## run the above functions to extract the data
rows, num_rows, num_cols = pre_process_table(table)
df = process_rows(rows, num_rows, num_cols)
## print the result
df.head()
|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |
|---|---|---|---|---|---|---|---|---|---|---|----|----|
| 0 | Year | Executive offices | Executive offices | Executive offices | Executive offices | Executive offices | State Legislature | State Legislature | United States Congress | United States Congress | United States Congress | Electoral College votes |
| 1 | Year | Governor | Lieutenant Governor | Secretary of State | Attorney General | Treasurer | State Senate | State Assembly | U.S. Senator (Class I) | U.S. Senator (Class III) | U.S. House | Electoral College votes |
| 2 | 2000 | Tommy Thompson (R) | Scott McCallum (R) | Doug La Follette (D) | Jim Doyle (D) | Jack Voight (R) | 17D, 16R | 55R, 44D | Herb Kohl (D) | Russ Feingold (D) | 5D, 4R | Gore/Lieberman (D) N |
| 3 | 2001 | Scott McCallum (R) | Margaret Farrow (R) | Doug La Follette (D) | Jim Doyle (D) | Jack Voight (R) | 18D, 15R | 56R, 43D | Herb Kohl (D) | Russ Feingold (D) | 5D, 4R | Gore/Lieberman (D) N |
| 4 | 2002 | Scott McCallum (R) | Margaret Farrow (R) | Doug La Follette (D) | Jim Doyle (D) | Jack Voight (R) | 18D, 15R | 56R, 43D | Herb Kohl (D) | Russ Feingold (D) | 5D, 4R | Gore/Lieberman (D) N |
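The two header rows come back duplicated across their spans. One optional post-processing step (my own convention, not part of the parser) is to merge them into single column names and drop them from the data:

## Combine the two header rows into one set of column names, then keep
## only the data rows. Purely cosmetic; adjust to taste.
df.columns = [top if top == bottom else "{0} - {1}".format(top, bottom)
              for top, bottom in zip(df.iloc[0], df.iloc[1])]
df = df.iloc[2:].reset_index(drop=True)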
Example 3 - Cricket
Stack Overflow had a Wikipedia-parsing question, so I thought I would use the page of interest there.
url = "http://en.wikipedia.org/wiki/List_of_England_Test_cricket_records"
site = fetch_website(url)
soup = bs(site, 'lxml')
tables = soup.findAll("table", class_='wikitable')
## I want the fourth wikitable on the page (index 3)
table = tables[3]
rows, num_rows, num_cols = pre_process_table(table)
df = process_rows(rows, num_rows, num_cols)
## print the result
df.head()
|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 |
|---|---|---|---|---|---|---|---|
| 0 | Rank | Score | Opposition | Venue | Place | Season | Result |
| 1 | 1 | 45 | v Australia | Sydney Cricket Ground | Sydney, Australia | 1886/87 | Won |
| 2 | 2 | 46 | v West Indies | Queen's Park Oval | Port of Spain, Trinidad and Tobago | 1994 | Lost |
| 3 | 3 | 51 | v West Indies | Sabina Park | Kingston, Jamaica | 2009 | Lost |
| 4 | 4 | 52 | v Australia | The Oval | London, England | 1948 | Lost |
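Everything parses back as strings. If you want to sort or aggregate, convert the numeric columns first; a sketch, with column indices matching the head() above (the header row simply coerces to NaN):

## Columns 0 (Rank) and 1 (Score) hold numbers in this table:
df[0] = pd.to_numeric(df[0], errors='coerce')
df[1] = pd.to_numeric(df[1], errors='coerce')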
Example 4 - Political Parties in DC
url = "https://en.wikipedia.org/wiki/Political_party_strength_in_Washington,_D.C."
site = fetch_website(url)
soup = bs(site, 'lxml')
tables = soup.findAll("table", class_='wikitable')
## I want the first wikitable on the page (index 0)
table = tables[0]
rows, num_rows, num_cols = pre_process_table(table)
df = process_rows(rows, num_rows, num_cols)
## print the result
df.head()
|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Year | Executive office | Executive office | District Council | District Council | U.S. Congress | U.S. Congress | U.S. Congress | U.S. Congress | Presidential electoral college votes |
| 1 | Year | Mayor | Attorney General | Chair | Composition | U.S. House Delegate | Shadow U.S. Senator (Seat 1) | Shadow U.S. Senator (Seat 2) | Shadow Representative | Presidential electoral college votes |
| 2 | 1801\n...\n1870 | [1] | [1] | [1] | [1] | [1] | [1] | [1] | [1] | [2] |
| 3 | 1871 | [3] | [3] | [3] | [3] | Norton P. Chipman (R) | [4] | [4] | [4] | [2] |
| 4 | 1872 | [3] | [3] | [3] | [3] | Norton P. Chipman (R) | [4] | [4] | [4] | [2] |
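Finally, if you would rather not hunt for table indices at all, every wikitable on a page can be parsed in one pass; a convenience sketch using the functions above:

## Parse every wikitable on the page into a list of DataFrames:
dfs = [main(t) for t in soup.findAll("table", class_='wikitable')]
len(dfs)   # number of wikitables found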