BazerUtils.jl

Assorted Julia utilities including custom logging
Log | Files | Refs | README | LICENSE

html_tables.jl (10611B)


      1 using Test
      2 using BazerUtils
      3 using DataFrames
      4 
      5 @testset "HTMLTables" begin
      6 
      7 # ==================================================================================
      8 # Tier 1: Core table parsing
      9 # ==================================================================================
     10 
     11 @testset "Tier 1: Core parsing" begin
     12 
     13 @testset "basic table with thead/tbody" begin
     14     html = """
     15     <table>
     16       <thead><tr><th>A</th><th>B</th></tr></thead>
     17       <tbody><tr><td>1</td><td>2</td></tr>
     18              <tr><td>3</td><td>4</td></tr></tbody>
     19     </table>"""
     20     dfs = read_html_tables(html)
     21     @test length(dfs) == 1
     22     df = dfs[1]
     23     @test names(df) == ["A", "B"]
     24     @test size(df) == (2, 2)
     25     @test df[1, "A"] == "1"
     26     @test df[2, "B"] == "4"
     27 end
     28 
     29 @testset "table without thead (auto-detect from th rows)" begin
     30     html = """
     31     <table>
     32       <tr><th>X</th><th>Y</th></tr>
     33       <tr><td>a</td><td>b</td></tr>
     34     </table>"""
     35     dfs = read_html_tables(html)
     36     @test length(dfs) == 1
     37     @test names(dfs[1]) == ["X", "Y"]
     38     @test dfs[1][1, "X"] == "a"
     39 end
     40 
     41 @testset "multiple tbody elements concatenated" begin
     42     html = """
     43     <table>
     44       <thead><tr><th>A</th><th>B</th></tr></thead>
     45       <tbody><tr><td>1</td><td>2</td></tr></tbody>
     46       <tbody><tr><td>3</td><td>4</td></tr></tbody>
     47     </table>"""
     48     dfs = read_html_tables(html)
     49     @test size(dfs[1]) == (2, 2)
     50     @test dfs[1][2, "A"] == "3"
     51 end
     52 
     53 @testset "tfoot with data appended to body" begin
     54     html = """
     55     <table>
     56       <thead><tr><th>A</th><th>B</th></tr></thead>
     57       <tbody><tr><td>1</td><td>2</td></tr></tbody>
     58       <tfoot><tr><td>foot1</td><td>foot2</td></tr></tfoot>
     59     </table>"""
     60     dfs = read_html_tables(html)
     61     @test size(dfs[1]) == (2, 2)
     62     @test dfs[1][2, "A"] == "foot1"
     63 end
     64 
     65 @testset "mixed th/td in body row" begin
     66     html = """
     67     <table>
     68       <thead><tr><th>Country</th><th>City</th><th>Year</th></tr></thead>
     69       <tbody><tr><td>Ukraine</td><th>Odessa</th><td>1944</td></tr></tbody>
     70     </table>"""
     71     dfs = read_html_tables(html)
     72     @test dfs[1][1, "City"] == "Odessa"
     73 end
     74 
     75 @testset "single column table" begin
     76     html = """
     77     <table>
     78       <tr><th>Only</th></tr>
     79       <tr><td>val</td></tr>
     80     </table>"""
     81     dfs = read_html_tables(html)
     82     @test size(dfs[1]) == (1, 1)
     83     @test names(dfs[1]) == ["Only"]
     84 end
     85 
     86 @testset "empty table skipped" begin
     87     html = """
     88     <table><tbody></tbody></table>
     89     <table>
     90       <tr><th>A</th></tr>
     91       <tr><td>1</td></tr>
     92     </table>"""
     93     dfs = read_html_tables(html)
     94     @test length(dfs) == 1
     95     @test names(dfs[1]) == ["A"]
     96 end
     97 
     98 @testset "multiple tables in document" begin
     99     html = """
    100     <table><tr><th>T1</th></tr><tr><td>a</td></tr></table>
    101     <table><tr><th>T2</th></tr><tr><td>b</td></tr></table>
    102     <table><tr><th>T3</th></tr><tr><td>c</td></tr></table>"""
    103     dfs = read_html_tables(html)
    104     @test length(dfs) == 3
    105     @test names(dfs[2]) == ["T2"]
    106 end
    107 
    108 @testset "match kwarg filters tables" begin
    109     html = """
    110     <table><tr><th>Name</th></tr><tr><td>park</td></tr></table>
    111     <table><tr><th>Other</th></tr><tr><td>data</td></tr></table>"""
    112     dfs = read_html_tables(html; match=r"park"i)
    113     @test length(dfs) == 1
    114     @test names(dfs[1]) == ["Name"]
    115 end
    116 
    117 end # Tier 1
    118 
    119 
    120 # ==================================================================================
    121 # Tier 2: Colspan/rowspan
    122 # ==================================================================================
    123 
    124 @testset "Tier 2: Colspan/rowspan" begin
    125 
    126 @testset "colspan=1 and rowspan=1 are no-ops" begin
    127     html = """
    128     <table>
    129       <tr><th>A</th><th colspan="1">B</th><th rowspan="1">C</th></tr>
    130       <tr><td>a</td><td>b</td><td>c</td></tr>
    131     </table>"""
    132     dfs = read_html_tables(html)
    133     @test names(dfs[1]) == ["A", "B", "C"]
    134     @test dfs[1][1, "B"] == "b"
    135 end
    136 
    137 @testset "colspan=2 in header" begin
    138     html = """
    139     <table>
    140       <tr><th colspan="2">Wide</th><th>Narrow</th></tr>
    141       <tr><td>a</td><td>b</td><td>c</td></tr>
    142     </table>"""
    143     dfs = read_html_tables(html)
    144     @test size(dfs[1], 2) == 3
    145     @test dfs[1][1, 1] == "a"
    146     @test dfs[1][1, 3] == "c"
    147 end
    148 
    149 @testset "colspan=2 in body" begin
    150     html = """
    151     <table>
    152       <tr><th>A</th><th>B</th><th>C</th></tr>
    153       <tr><td colspan="2">wide</td><td>c</td></tr>
    154     </table>"""
    155     dfs = read_html_tables(html)
    156     @test dfs[1][1, "A"] == "wide"
    157     @test dfs[1][1, "B"] == "wide"
    158     @test dfs[1][1, "C"] == "c"
    159 end
    160 
    161 @testset "rowspan=2 in body" begin
    162     html = """
    163     <table>
    164       <tr><th>A</th><th>B</th></tr>
    165       <tr><td rowspan="2">tall</td><td>1</td></tr>
    166       <tr><td>2</td></tr>
    167     </table>"""
    168     dfs = read_html_tables(html)
    169     @test size(dfs[1]) == (2, 2)
    170     @test dfs[1][1, "A"] == "tall"
    171     @test dfs[1][2, "A"] == "tall"
    172     @test dfs[1][2, "B"] == "2"
    173 end
    174 
    175 @testset "rowspan at end of row" begin
    176     html = """
    177     <table>
    178       <tr><th>A</th><th>B</th></tr>
    179       <tr><td>x</td><td rowspan="2">y</td></tr>
    180       <tr><td>z</td></tr>
    181     </table>"""
    182     dfs = read_html_tables(html)
    183     @test dfs[1][2, "B"] == "y"
    184     @test dfs[1][2, "A"] == "z"
    185 end
    186 
    187 @testset "both rowspan and colspan on same cell" begin
    188     html = """
    189     <table>
    190       <tr><th>A</th><th>B</th><th>C</th><th>D</th><th>E</th></tr>
    191       <tr><td rowspan="2">a</td><td rowspan="2" colspan="3">block</td><td>e1</td></tr>
    192       <tr><td>e2</td></tr>
    193     </table>"""
    194     dfs = read_html_tables(html)
    195     @test size(dfs[1]) == (2, 5)
    196     @test dfs[1][1, "B"] == "block"
    197     @test dfs[1][1, "C"] == "block"
    198     @test dfs[1][1, "D"] == "block"
    199     @test dfs[1][2, "B"] == "block"
    200     @test dfs[1][2, "D"] == "block"
    201     @test dfs[1][2, "A"] == "a"
    202     @test dfs[1][1, "E"] == "e1"
    203     @test dfs[1][2, "E"] == "e2"
    204 end
    205 
    206 @testset "rowspan spanning header into body" begin
    207     html = """
    208     <table>
    209       <tr><th rowspan="2">A</th><th>B</th></tr>
    210       <tr><td>1</td></tr>
    211       <tr><td>C</td><td>2</td></tr>
    212     </table>"""
    213     dfs = read_html_tables(html)
    214     @test names(dfs[1]) == ["A", "B"]
    215     @test dfs[1][1, "A"] == "A"
    216     @test dfs[1][1, "B"] == "1"
    217     @test dfs[1][2, "A"] == "C"
    218     @test dfs[1][2, "B"] == "2"
    219 end
    220 
    221 @testset "rowspan-only rows" begin
    222     html = """
    223     <table>
    224       <tr><th>A</th><th>B</th></tr>
    225       <tr><td rowspan="3">x</td><td rowspan="3">y</td></tr>
    226     </table>"""
    227     dfs = read_html_tables(html)
    228     @test size(dfs[1]) == (3, 2)
    229     @test dfs[1][3, "A"] == "x"
    230     @test dfs[1][3, "B"] == "y"
    231 end
    232 
    233 end # Tier 2
    234 
    235 
    236 # ==================================================================================
    237 # Tier 3: Multi-level headers + flatten
    238 # ==================================================================================
    239 
    240 @testset "Tier 3: Multi-level headers" begin
    241 
    242 @testset "two th rows give string-tuple column names" begin
    243     html = """
    244     <table>
    245       <tr><th>A</th><th>B</th></tr>
    246       <tr><th>a</th><th>b</th></tr>
    247       <tr><td>1</td><td>2</td></tr>
    248     </table>"""
    249     dfs = read_html_tables(html)
    250     @test names(dfs[1]) == ["(A, a)", "(B, b)"]
    251 end
    252 
    253 @testset "flatten=:join joins with underscore" begin
    254     html = """
    255     <table>
    256       <tr><th>A</th><th>B</th></tr>
    257       <tr><th>a</th><th>b</th></tr>
    258       <tr><td>1</td><td>2</td></tr>
    259     </table>"""
    260     dfs = read_html_tables(html; flatten=:join)
    261     @test names(dfs[1]) == ["A_a", "B_b"]
    262 end
    263 
    264 @testset "flatten=:last takes last level" begin
    265     html = """
    266     <table>
    267       <tr><th>A</th><th>B</th></tr>
    268       <tr><th>a</th><th>b</th></tr>
    269       <tr><td>1</td><td>2</td></tr>
    270     </table>"""
    271     dfs = read_html_tables(html; flatten=:last)
    272     @test names(dfs[1]) == ["a", "b"]
    273 end
    274 
    275 @testset "Wikipedia-style colspan grouping with sub-headers" begin
    276     html = """
    277     <table>
    278       <tr><th rowspan="2">Name</th><th colspan="2">Size</th><th rowspan="2">Year</th></tr>
    279       <tr><th>acres</th><th>ha</th></tr>
    280       <tr><td>Park A</td><td>100</td><td>40</td><td>1920</td></tr>
    281     </table>"""
    282     dfs = read_html_tables(html)
    283     @test names(dfs[1]) == ["(Name, Name)", "(Size, acres)", "(Size, ha)", "(Year, Year)"]
    284     @test dfs[1][1, "(Size, acres)"] == "100"
    285 
    286     dfs2 = read_html_tables(html; flatten=:last)
    287     @test names(dfs2[1]) == ["Name", "acres", "ha", "Year"]
    288 end
    289 
    290 end # Tier 3
    291 
    292 
    293 # ==================================================================================
    294 # Tier 4: Data quality
    295 # ==================================================================================
    296 
    297 @testset "Tier 4: Data quality" begin
    298 
    299 @testset "empty cells become missing" begin
    300     html = """
    301     <table>
    302       <tr><th>A</th><th>B</th></tr>
    303       <tr><td></td><td>val</td></tr>
    304     </table>"""
    305     dfs = read_html_tables(html)
    306     @test ismissing(dfs[1][1, "A"])
    307     @test dfs[1][1, "B"] == "val"
    308 end
    309 
    310 @testset "ragged rows padded with missing" begin
    311     html = """
    312     <table>
    313       <tr><th>A</th><th>B</th><th>C</th></tr>
    314       <tr><td>1</td></tr>
    315     </table>"""
    316     dfs = read_html_tables(html)
    317     @test dfs[1][1, "A"] == "1"
    318     @test ismissing(dfs[1][1, "B"])
    319     @test ismissing(dfs[1][1, "C"])
    320 end
    321 
    322 @testset "br inside cell becomes space" begin
    323     html = """
    324     <table>
    325       <tr><th>A</th></tr>
    326       <tr><td>word1<br>word2</td></tr>
    327     </table>"""
    328     dfs = read_html_tables(html)
    329     @test dfs[1][1, "A"] == "word1 word2"
    330 end
    331 
    332 @testset "style tag stripped from header" begin
    333     html = """
    334     <table>
    335       <tr><th><style>.x{color:red}</style>Name</th><th>B</th></tr>
    336       <tr><td>a</td><td>b</td></tr>
    337     </table>"""
    338     dfs = read_html_tables(html)
    339     @test strip(names(dfs[1])[1]) == "Name" || names(dfs[1])[1] == "Name"
    340 end
    341 
    342 @testset "whitespace normalization" begin
    343     html = """
    344     <table>
    345       <tr><th>  A  </th></tr>
    346       <tr><td>  val  </td></tr>
    347     </table>"""
    348     dfs = read_html_tables(html)
    349     @test names(dfs[1]) == ["A"]
    350     @test dfs[1][1, "A"] == "val"
    351 end
    352 
    353 end # Tier 4
    354 
    355 
    356 # ==================================================================================
    357 # Integration: real Wikipedia page
    358 # ==================================================================================
    359 
    360 @testset "Integration: Wikipedia state parks" begin
    361     try
    362         dfs = read_html_tables(
    363             "https://en.wikipedia.org/wiki/List_of_Alabama_state_parks";
    364             match=r"[Nn]ame", flatten=:last)
    365         @test length(dfs) >= 1
    366         df = dfs[1]
    367         @test any(contains.(lowercase.(names(df)), "name"))
    368         @test nrow(df) > 10
    369     catch e
    370         if e isa HTTP.Exceptions.StatusError || e isa Downloads.RequestError
    371             @warn "Skipping Wikipedia test (network error)"
    372         else
    373             rethrow(e)
    374         end
    375     end
    376 end
    377 
    378 end # HTMLTables