我正在尝试从网页中删除 href 链接列表,然后尝试从中删除值。我现在面临代码最多只能处理 5 个链接的问题。如果链接超过 5 个,它将在随机行上显示运行时错误。
我正在从这些网页中提取 href 链接:http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=SH&sub_category=all&alphabetical=All&date_from=28/09/2018
Option Explicit
Sub ScrapLink()
Dim IE As New InternetExplorer, html As HTMLDocument
Application.ScreenUpdating = False
With IE
IE.Visible = False
IE.navigate Cells(1, 1).Value
While .Busy Or .readyState < 4: DoEvents: Wend
Application.Wait Now + TimeSerial(0, 0, 3)
Application.StatusBar = "Trying to go to website?"
DoEvents
Dim links As Object, i As Long
Set links = .document.querySelectorAll("#bm_ajax_container [href^='/market/listed-companies/company-announcements/']")
For i = 1 To links.Length
With ThisWorkbook.Worksheets("Sheet1")
.Cells(i + 1, 1) = links.item(i - 1)
End With
Next i
.Quit
End With
End Sub
Public Sub GetInfo()
Dim IE As New InternetExplorer, headers(), u As Long, resultCollection As Collection
headers = Array("URL", "Name", "No", "Date of change", "# Securities", "Type of Transaction", "Nature of Interest")
Set resultCollection = New Collection
Dim links()
links = Application.Transpose(ThisWorkbook.Worksheets("Sheet1").Range("A2:A100"))
With IE
.Visible = True
For u = LBound(links) To UBound(links)
If InStr(links(u), "http") > 0 Then
.navigate links(u)
While .Busy Or .readyState < 4: DoEvents: Wend
Application.Wait Now + TimeSerial(0, 0, 2)
Dim data As Object, title As Object
With .document.getElementById("bm_ann_detail_iframe").contentDocument
Set title = .querySelector(".formContentData")
Set data = .querySelectorAll(".ven_table tr")
End With
Dim results(), numberOfRows As Long, i As Long, currentRow As Object, td As Object, c As Long, r As Long
numberOfRows = Round(data.Length / 4, 0)
ReDim results(1 To numberOfRows, 1 To 7)
For i = 0 To numberOfRows - 1
r = i + 1
results(r, 1) = links(u): results(r, 2) = title.innerText
Set currentRow = data.item(i * 4 + 1)
c = 3
For Each td In currentRow.getElementsByTagName("td")
results(r, c) = Replace$(td.innerText, "document.write(rownum++);", vbNullString)
c = c + 1
Next td
Next i
resultCollection.Add results
Set data = Nothing: Set title = Nothing
End If
Next u
.Quit
End With
Dim ws As Worksheet, item As Long
If Not resultCollection.Count > 0 Then Exit Sub
If Not Evaluate("ISREF('Results'!A1)") Then '<==Credit to @Rory for this test
Set ws = Worksheets.Add
ws.NAME = "Results"
Else
Set ws = ThisWorkbook.Worksheets("Results")
ws.Cells.Clear
End If
Dim outputRow As Long: outputRow = 2
With ws
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
For item = 1 To resultCollection.Count
Dim arr()
arr = resultCollection(item)
For i = LBound(arr, 1) To UBound(arr, 1)
.Cells(outputRow, 1).Resize(1, 7) = Application.WorksheetFunction.Index(arr, i, 0)
outputRow = outputRow + 1
Next
Next
End With
End Sub
最佳答案
讨论:
至少从我的测试来看,问题很可能是由于其中一个链接没有表 Details of changes
,因此 numberOfRows
变量设置为 0
,这一行:
ReDim results(1 To numberOfRows, 1 To 7)
因为您有 (1 To 0, 1 To 7)
而因索引错误而失败。
使用 this link在 A1 中检索到 30 个 URL。这检索到 link没有那张 table ,而其他人有。
您可以选择如何处理这种情况。以下是一些示例选项:
选项 1: 仅处理 numberOfRows > 0
的页面。这是我举的例子。
选项 2:使用 numberOfRows
进行Select Case
,如果 Case 0
则以一种方式处理页面, Case Else
正常处理。
注意:
1) 您还想重置状态栏:
Application.StatusBar = False
2) 我暂时修复了用于测试的链接范围:
ThisWorkbook.Worksheets("Sheet1").Range("A2:A31")
待办事项:
- 重构为更加模块化,并使用同一个 IE 实例运行整个过程。创建一个类来保存 IE 对象是个好主意。为其提供提取数据、测试结果行数等的方法。
- 添加一些基本的错误处理,例如处理网站连接失败。
使用 numberOfRows > 0 测试的示例处理:
Option Explicit
Sub ScrapeLink()
Dim IE As New InternetExplorer
Application.ScreenUpdating = False
With IE
IE.Visible = True
IE.navigate Cells(1, 1).Value
While .Busy Or .readyState < 4: DoEvents: Wend
' Application.Wait Now + TimeSerial(0, 0, 3)
Application.StatusBar = "Trying to go to website?"
DoEvents
Dim links As Object, i As Long
Set links = .document.querySelectorAll("#bm_ajax_container [href^='/market/listed-companies/company-announcements/']")
For i = 1 To links.Length
With ThisWorkbook.Worksheets("Sheet1")
.Cells(i + 1, 1) = links.item(i - 1)
End With
Next i
.Quit
End With
Application.StatusBar = false
End Sub
Public Sub GetInfo()
Dim IE As New InternetExplorer, headers(), u As Long, resultCollection As Collection
headers = Array("URL", "Name", "No", "Date of change", "# Securities", "Type of Transaction", "Nature of Interest")
Set resultCollection = New Collection
Dim links()
links = Application.Transpose(ThisWorkbook.Worksheets("Sheet1").Range("A2:A31")) '<== I have fixed the range here for testing
With IE
.Visible = True
For u = LBound(links) To UBound(links)
If InStr(links(u), "http") > 0 Then
.navigate links(u)
While .Busy Or .readyState < 4: DoEvents: Wend
Application.Wait Now + TimeSerial(0, 0, 2)
Dim data As Object, title As Object
With .document.getElementById("bm_ann_detail_iframe").contentDocument
Set title = .querySelector(".formContentData")
Set data = .querySelectorAll(".ven_table tr")
End With
Dim results(), numberOfRows As Long, i As Long, currentRow As Object, td As Object, c As Long, r As Long
numberOfRows = Round(data.Length / 4, 0)
If numberOfRows > 0 Then
ReDim results(1 To numberOfRows, 1 To 7)
For i = 0 To numberOfRows - 1
r = i + 1
results(r, 1) = links(u): results(r, 2) = title.innerText
Set currentRow = data.item(i * 4 + 1)
c = 3
For Each td In currentRow.getElementsByTagName("td")
results(r, c) = Replace$(td.innerText, "document.write(rownum++);", vbNullString)
c = c + 1
Next td
Next i
resultCollection.Add results
Set data = Nothing: Set title = Nothing
End If
End If
Next u
.Quit
End With
Dim ws As Worksheet, item As Long
If Not resultCollection.Count > 0 Then Exit Sub
If Not Evaluate("ISREF('Results'!A1)") Then '<==Credit to @Rory for this test
Set ws = Worksheets.Add
ws.NAME = "Results"
Else
Set ws = ThisWorkbook.Worksheets("Results")
ws.Cells.Clear
End If
Dim outputRow As Long: outputRow = 2
With ws
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
For item = 1 To resultCollection.Count
Dim arr()
arr = resultCollection(item)
For i = LBound(arr, 1) To UBound(arr, 1)
.Cells(outputRow, 1).Resize(1, 7) = Application.WorksheetFunction.Index(arr, i, 0)
outputRow = outputRow + 1
Next
Next
End With
End Sub
示例结果:
关于html - 从 href 链接列表中抓取数据?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/52579706/