excel - 在 Excel 中获取带有超链接的网页表格和带有 VBA 的表格

标签 excel vba internet-explorer web-scraping xmlhttprequest

我正在使用此脚本通过 Microsoft Excel 获取网页的文本数据,但是,它只返回文本,但我想在单独的列中获取超链接。请你帮助我好吗?
该命令似乎只返回文本数据,但我正在寻找将文本和相应的 URL 保存为文本(当然不是超链接!)。

我校审了
https://msdn.microsoft.com/en-us/library/office/ff836520.aspx但我什么也找不到。

您可能会在代码中看到带有提供的 url 的网页。

Sub SaveUrl()
    Set shFirstQtr = Workbooks(1).Worksheets(1)
    Set qtQtrResults = shFirstQtr.QueryTables _
                       .Add(Connection:="URL;http://www.tsetmc.com/Loader.aspx?ParTree=111C1417", _
                            Destination:=shFirstQtr.Cells(1, 1))
    With qtQtrResults
        .WebFormatting = xlNone
        .WebSelectionType = xlSpecifiedTables
        .WebTables = "1"
        .Refresh
    End With
End Sub

最佳答案

以下示例展示了如何自动化 IE 并从 DOM 检索必要的数据(运行 TestIE()),以及使用 XHR 发出请求并使用 RegEx 解析响应(运行 TestXHR()):

Option Explicit

' The code to automate IE and retrieve the necessary data from DOM

Sub TestIE()

    Dim aText() As Variant
    Dim aHref() As Variant
    Dim aHrefExists() As Boolean
    Dim aRes() As Variant
    Dim lRowsCount As Long
    Dim lCellsCount As Long
    Dim i As Long
    Dim j As Long
    Dim lCellsTotal As Long
    Dim x As Long

    With CreateObject("InternetExplorer.Application")
        ' Make visible for debug
        .Visible = True
        ' Navigate to page
        .Navigate "http://www.tsetmc.com/Loader.aspx?ParTree=111C1417"
        ' Wait for IE ready
        Do While .ReadyState <> 4 Or .Busy
            DoEvents
        Loop
        ' Wait for document complete
        Do While .Document.ReadyState <> "complete"
            DoEvents
        Loop
        ' Wait for target table accessible
        Do While TypeName(.Document.getElementById("tblToGrid")) = "Null"
            DoEvents
        Loop
        ' Process target table
        With .Document.getElementById("tblToGrid")
            ' Get table size
            lRowsCount = .Rows.Length
            lCellsCount = .Rows(0).Cells.Length
            ' Create 2d arrays for texts and hyperlinks values, and for column url existance flag
            ReDim aText(1 To lRowsCount, 1 To lCellsCount)
            ReDim aHref(1 To lRowsCount, 1 To lCellsCount)
            ReDim aHrefExists(1 To lCellsCount)
            ' Process each table row
            For i = 1 To lRowsCount
                With .Rows(i - 1)
                    ' Process each cell
                    For j = 1 To lCellsCount
                        ' Retrieve text content
                        aText(i, j) = .Cells(j - 1).innerText
                        ' Retrieve hyperlink if exists
                        With .Cells(j - 1).getElementsByTagName("a")
                            If .Length = 1 Then
                                aHrefExists(j) = True
                                aHref(i, j) = .Item(0).href
                            End If
                        End With
                    Next
                End With
            Next
        End With
        .Quit
    End With
    ' Create resulting array that includes texts and urls
    lCellsTotal = lCellsCount
    For j = 1 To lCellsCount
        If aHrefExists(j) Then lCellsTotal = lCellsTotal + 1
    Next
    ReDim aRes(1 To lRowsCount, 1 To lCellsTotal)
    ' Populate array with texts and urls
    x = 1
    For j = 1 To lCellsCount
        For i = 1 To lRowsCount
            aRes(i, x) = aText(i, j)
        Next
        x = x + 1
        If aHrefExists(j) Then
            For i = 1 To lRowsCount
                aRes(i, x) = aHref(i, j)
            Next
            x = x + 1
        End If
    Next
    ' Result output to sheet 1
    With Sheets(1)
        .Cells.Delete
        Output .Cells(1, 1), aRes
    End With
End Sub

' The code to make request with XHR and parse response with RegEx

Sub TestXHR()

    Dim sRespText As String
    Dim oRERows As Object
    Dim oRECells As Object
    Dim aRes() As Variant
    Dim lRowsCount As Long
    Dim lCellsCount As Long
    Dim i As Long
    Dim j As Long
    Dim lCellsTotal As Long
    Dim x As Long

    ' Retrieve HTML content
    With CreateObject("MSXML2.XMLHttp")
        .Open "GET", "http://www.tsetmc.com/Loader.aspx?ParTree=111C1417", False
        .Send
        sRespText = .responseText
    End With
    ' Regular expression for table rows setup
    Set oRERows = CreateObject("VBScript.RegExp")
    With oRERows
        .Global = True
        .MultiLine = True
        .IgnoreCase = True
        .Pattern = "<tr.*?>[\s\S]*?</tr>"
    End With
    ' Regular expression for table cells setup
    Set oRECells = CreateObject("VBScript.RegExp")
    With oRECells
        .Global = True
        .MultiLine = True
        .IgnoreCase = True
        .Pattern = "<td.*?>(?:.*?<a.*?href=(""|')(.*?)\1.*?>(.*?)</a>.*?|(.*?))</td>"
    End With
    ' Execute 1st regexp on response
    With oRERows.Execute(sRespText)
        ' Get table size
        lRowsCount = .Count
        lCellsCount = oRECells.Execute(.Item(0).Value).Count
        ' Create 2d arrays for texts and hyperlinks values, and for column url existance flag
        ReDim aText(1 To lRowsCount, 1 To lCellsCount)
        ReDim aHref(1 To lRowsCount, 1 To lCellsCount)
        ReDim aHrefExists(1 To lCellsCount)
        ' Process each table row
        For i = 1 To lRowsCount
            ' Get 1st regexp match value, and execute 2nd regexp on it
            With oRECells.Execute(.Item(i - 1).Value)
            ' Process each cell
            For j = 1 To .Count
                With .Item(j - 1)
                    If .SubMatches(3) <> "" Then
                        ' Retrieve text content only
                        aText(i, j) = .SubMatches(3)
                    Else
                        ' Retrieve text content and hyperlink
                        aText(i, j) = .SubMatches(2)
                        aHref(i, j) = "http://www.tsetmc.com/" & .SubMatches(1)
                        aHrefExists(j) = True
                    End If
                End With
            Next
            End With
        Next
    End With
    ' Create resulting array that includes texts and urls
    lCellsTotal = lCellsCount
    For j = 1 To lCellsCount
        If aHrefExists(j) Then lCellsTotal = lCellsTotal + 1
    Next
    ReDim aRes(1 To lRowsCount, 1 To lCellsTotal)
    ' Populate array with texts and urls
    x = 1
    For j = 1 To lCellsCount
        For i = 1 To lRowsCount
            aRes(i, x) = aText(i, j)
        Next
        x = x + 1
        If aHrefExists(j) Then
            For i = 1 To lRowsCount
                aRes(i, x) = aHref(i, j)
            Next
            x = x + 1
        End If
    Next
    ' Result output to sheet 2
    With Sheets(2)
        .Cells.Delete
        Output .Cells(1, 1), aRes
    End With

End Sub

' Utility section

Sub Output(objDstRng As Range, arrCells As Variant)
    With objDstRng
        .Parent.Select
        With .Resize( _
                UBound(arrCells, 1) - LBound(arrCells, 1) + 1, _
                UBound(arrCells, 2) - LBound(arrCells, 2) + 1)
            .NumberFormat = "@"
            .Value = arrCells
            .Columns.AutoFit
        End With
    End With
End Sub

两种方法都给出相同的结果(在工作表 1 和 2 上):

result

关于excel - 在 Excel 中获取带有超链接的网页表格和带有 VBA 的表格,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/38940804/

相关文章:

javascript - window.close() 在 Windows Phone IE 上不起作用

c# - 用于反透视数据表的 Lambda 表达式

excel - VBA获取组合框到 'suggest'的一个选项

vba - 使用 Powershell 关闭 Excel 应用程序

excel - Application.Match 不起作用 - 类型不兼容

html - Microsoft Edge 和 IE 去掉了页面底部固定页脚的空间

html - 元素宽度的 IE 可计算性问题

excel - 从 Web 抓取到 Excel 时复制数据时出错

vba - 跳过循环中的空单元格

c# - 如何在 Open XML SDK for Excel 中按名称获取工作表