vba - XHTML 网站抓取指南

标签 vba excel parsing web-scraping xmlhttprequest

我是 VBA 和 HTML/XHTML 的新手,但通过在线研究和其他出色成员的帮助,我设法编写了一段代码来提取我想要的数据。我很难确定我想要的元素的 ID,因为它是在 XHTML 中,所以我认为这是我最糟糕的地方。

网址: http://www.usbanklocations.com/banks.php?q=&ct=&ml=30&lc=

这是我希望代码执行的操作: 提取银行名称、地址、电话号码、总存款和总 Assets ——鉴于我在 Excel 工作表中提供的银行名称和城市。

这是我的代码:

Private Declare Sub Sleep Lib "kernel32.dll" (ByVal dwMilliseconds As Long)
Sub CommunityBanks()
    Dim IE As Object, TableResults As Object, webRow As Object, BankName As Variant, page As Long, pageTotal As Long, r As Long
    Dim beginTime As Date, i As Long, myvalue As Variant

Set IE = CreateObject("internetexplorer.application")
IE.navigate "http://www.usbanklocations.com/banks.php?name=" & Range("A2").Value & "+Bank&ml=30&lc=" & Range("B2").Value & "%2C+TX"
IE.Visible = True

Do While IE.Busy Or IE.readystate <> 4   '4 = READYSTATE_COMPLETE
    DoEvents
Loop

'input bank name into form
'myvalue = InputBox("Enter City. Press okay to begin search", "Bank Search")
'Range("F3").Value = myvalue
'IE.document.getelementbyid("MainContent_txtCity").Value = "LegacyTexas"
'click find button
'IE.document.getelementbyid("MainContent_btn").Click
'Sleep 5 * 1000
IE.document.getelementbytagname("table").getelementsbyclassname("btn").Click
Sleep 5 * 1000

'total pages
pageTotal = IE.document.getelementbyid("lsortby").innertext
page = 0

Do Until page = pageTotal
    DoEvents
    page = IE.document.getelementbyclassname("lsortby").innertext
    With IE.document.getelementbyid("main")
        For r = 1 To .Rows.Length - 1
            If Not IsArray(BankName) Then
                ReDim BankName(7, 0) As Variant
            Else
                ReDim Preserve BankName(7, UBound(BankName, 2) + 1) As Variant
            End If

            BankName(0, UBound(BankName, 2)) = .Rows(r).Cells(0).innertext
        Next r
    End With

    If page < pageTotal Then
        IE.document.getelementbyclassname("panelpn").Click
        beginTime = Now
        Application.Wait (Now + TimeValue("00:00:05"))
    End If
Loop

For r = 0 To UBound(BankName, 2)
    IE.navigate "http://www.usbanklocations.com/" & BankName(0, r)
    Do While IE.Busy Or IE.readystate <> 4   '4 = READYSTATE_COMPLETE
        DoEvents
    Loop
    'wait 5 sec. for screen refresh
    Sleep 5 * 1000

    With IE.document.getelementbytagname("table")
        For i = 0 To .Rows.Length - 1
            DoEvents
            Select Case .Rows(i).Cells(0).innertext
            Case "Name:"
                BankName(1, r) = .Rows(i).Cells(1).innertext
            Case "Location:"
                BankName(2, r) = .Rows(i).Cells(1).innertext
            Case "Phone:"
                BankName(3, r) = .Rows(i).Cells(1).innertext
            Case "Branch Deposit:"
                BankName(4, r) = Replace(Replace(.Rows(i).Cells(1).innertext, ",", ""), "$", "")
            Case "Total Assets:"
                BankName(5, r) = Replace(Replace(.Rows(i).Cells(1).innertext, ",", ""), "$", "")
            End Select
        Next i
    End With
Next r


IE.Quit
Set IE = Nothing

'post result on Excel cell
Worksheets(1).Range("A9").Resize(UBound(BankName, 2) + 1, UBound(BankName, 1) + 1).Value = Application.Transpose(BankName)
End Sub

提前致谢!我将不胜感激任何帮助。

最佳答案

考虑下面的示例,它使用 XHR 而不是 IE 和基于拆分的 HTML 内容解析:

Option Explicit

Sub Test_usbanklocations()

    Dim oSource, oDestination, y, oSrcRow, sName, sCity, sDist, sUrl0, sUrl1, sUrl2, lPage, sResp1, sResp2, i, a1, a2, a3, a4, a5

    Set oSource = Sheets(1)
    Set oDestination = Sheets(2)
    oDestination.Cells.Delete
    DataOutput oDestination, 1, Array("Name", "Location", "Phone", "Total Assets", "Total Deposits")
    y = 2

    For Each oSrcRow In oSource.UsedRange.Rows
        sName = oSrcRow.Cells(1, 1).Value
        sCity = oSrcRow.Cells(1, 2).Value
        sDist = oSrcRow.Cells(1, 3).Value
        sUrl0 = "http://www.usbanklocations.com/banks.php?q=" & EncodeUriComponent(sName) & "&lc=" & EncodeUriComponent(sCity) & "&ml=" & sDist
        sUrl1 = sUrl0
        lPage = 1
        Do
            sResp1 = GetXHR(sUrl1)
            If InStr(sResp1, "We can not find the address you provided. Please check.") > 0 Then Exit Do
            a1 = Split(sResp1, "<div class=""pl")
            For i = 1 To UBound(a1)
                a2 = Split(a1(i), "</div>", 3)
                a3 = Split(a2(1), "<a href=""", 2)
                a4 = Split(a3(1), """>", 2)
                sUrl2 = "http://www.usbanklocations.com" & a4(0)
                sResp2 = GetXHR(sUrl2)
                a5 = Array( _
                    GetFragment(sResp2, "<b>Name:</b></td><td>", "</td>"), _
                    Replace(GetFragment(sResp2, "<b>Location:</b></td><td>", "</td>"), "View Other Branches", ""), _
                    GetFragment(sResp2, "<b>Phone:</b></td>", "</td>"), _
                    GetFragment(sResp2, "<b>Total Assets:</b></td><td>", "</td>"), _
                    GetFragment(sResp2, "<b>Total Deposits:</b></td><td>", "</td>") _
                )
                DataOutput oDestination, y, a5
                y = y + 1
                DoEvents
            Next
            If InStr(sResp1, "Next Page &gt;") = 0 Then Exit Do
            lPage = lPage + 1
            sUrl1 = sUrl0 & "&ps=" & lPage
            DoEvents
        Loop
    Next

    MsgBox "Completed"

End Sub

Function GetXHR(sUrl)

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", sUrl, False
        .Send
        GetXHR = .ResponseText
    End With

End Function

Sub DataOutput(oSht, y, aValues)

    With oSht.Cells(y, 1).Resize(1, UBound(aValues) + 1)
        .NumberFormat = "@"
        .Value = aValues
    End With

End Sub

Function GetFragment(sText, sPatt1, sPatt2)

    Dim a1, a2

    a1 = Split(sText, sPatt1, 2)
    If UBound(a1) <> 1 Then Exit Function
    a2 = Split(a1(1), sPatt2, 2)
    If UBound(a2) <> 1 Then Exit Function
    GetFragment = GetInnerText(a2(0))

End Function

Function EncodeUriComponent(sText)

    Static objHtmlfile As Object

    If objHtmlfile Is Nothing Then
        Set objHtmlfile = CreateObject("htmlfile")
        objHtmlfile.parentWindow.execScript "function encode(s) {return encodeURIComponent(s)}", "jscript"
    End If
    EncodeUriComponent = objHtmlfile.parentWindow.encode(sText)

End Function

Function GetInnerText(sText)

    With CreateObject("htmlfile")
        .Write ("<body>" & sText & "</body>")
        GetInnerText = .DocumentElement.Document.GetElementsByTagName("body")(0).InnerText
    End With

End Function

例如,第一个工作表包含要搜索的数据(银行名称、位置和要优化的距离):

source

那么第二张工作表的结果如下:

result

关于vba - XHTML 网站抓取指南,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/37797346/

相关文章:

excel - 如何将单元格值添加到公式栏中的某些部分

java - 是否有与 scala 的解析器生成器等效的 java 版本?

parsing - 使用 Haskell 将随机嵌套列表展平为非嵌套列表

excel - VBA:如何对值列表使用like运算符?

vba - 在Excel VBA中手动设置Application.Caller

excel - 如何用 POI SS/XSSF 转置工作表?

VBA循环遍历单元格范围失败

c# - 从 ObservableCollection<t>.Where 获取索引和对象

html - 需要帮助从 HTML 获取表格

regex - VBA 正则表达式 - 匹配不以特定单词开头的表达式