我需要从 this site 获取价格表.
为此,我已经开发了一些代码:
Sub TableExample()
Dim IE As Object
Dim doc As Object
Dim strURL As String
strURL = "http://www.idealo.de/preisvergleich/OffersOfProduct/143513.html"
' replace with URL of your choice
Set IE = CreateObject("InternetExplorer.Application")
With IE
'.Visible = True
.navigate strURL
Do Until .readyState = 4: DoEvents: Loop
Do While .Busy: DoEvents: Loop
Set doc = IE.document
GetAllTables doc
.Quit
End With
End Sub
Sub GetAllTables(doc As Object)
' get all the tables from a webpage document, doc, and put them in a new worksheet
Dim ws As Worksheet
Dim rng As Range
Dim tbl As Object
Dim rw As Object
Dim cl As Object
Dim tabno As Long
Dim nextrow As Long
Dim I As Long
Set ws = Sheets("Sheet1")
For Each tbl In doc.getElementsByTagName("TABLE")
tabno = tabno + 1
nextrow = nextrow + 1
Set rng = ws.Range("B" & nextrow)
'rng.Offset(, -1) = "Table " & tabno
If tabno = 5 Then
For Each rw In tbl.Rows
colno = 6
For Each cl In rw.Cells
If colno = 5 And nextrow < 1 Then
Set classColl = doc.getElementsByClassName("shop")
Set imgTgt = classColl(nextrow - 2).getElementsByTagName("img").getElementsByClassName("btn-goto-shop")
rng.Value = imgTgt(0).getAttribute("alt")
Else
rng.Value = cl.innerText
End If
Set rng = rng.Offset(, 1)
I = I + 1
colno = colno + 1
Next cl
nextrow = nextrow + 1
Set rng = rng.Offset(1, -I)
I = 0
Next rw
End If
Next tbl
ws.Cells.ClearFormats
End Sub
通过此代码,我可以获得所需的结果,除了未获取带有给定商店名称的最后一列。谁能帮我这个?
最佳答案
如果您检查页面的 HTML,您可以看到带有 className productOffers-listItemOfferPrice
的元素有所需的信息。有比你可能意识到的更多的信息。在底部查看我的代码输出。
在主子GetTable
,我使用 XHR 请求来获取页面 HTML 并将其存储在 HTML 文档中。
当你这样做 .getElementsByClassName("productOffers-listItemOfferPrice")
要获取所有项目信息,您需要解析每个元素 .outerHTML
.
辅助函数 GetTransactionInfo
使用 split 函数仅获取 .outerHTML
的产品信息部分.返回的字符串类似于以下示例:
" 			"product_id": 			"143513", 			"product_name": ..."
辅助函数
TidyString
采用 inputString 和正则表达式模式,应用正则表达式模式匹配来整理产品信息字符串,方法是匹配不需要的字符串并将它们替换为空文字字符串 (vbNullString
)。正则表达式模式 1:
例如,第一个正则表达式模式
"&#\d+;"
去掉字符串中所有带有数字的 :Try it
正则表达式模式 2:
第二个正则表达式模式,
Chr$(34) & headers(currentItem) & Chr$(34) & ":"
,从字符串中删除产品标题信息,即仅获取值。例如。需要
"product_id": "143513"
并返回 "143513"
.Try it
示例页面信息(示例)
示例代码输出:
VBA 代码:
Option Explicit
'Tools > References > HTML Object Library
Public Sub GetTable()
Dim sResponse As String, listItems As Object, html As HTMLDocument, headers()
headers = Array("product_id", "product_name", "product_price", "product_category", "currency", "spr", "shop_name", "delivery_time", "shop_rating", "position", "free_return", "approved_shipping")
Application.ScreenUpdating = False
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.idealo.de/preisvergleich/OffersOfProduct/143513.html", False
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE "))
Set html = New HTMLDocument
With html
.body.innerHTML = sResponse
Set listItems = .getElementsByClassName("productOffers-listItemOfferPrice")
End With
Dim currentItem As Long
With ActiveSheet
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
For currentItem = 0 To listItems.Length - 1
Dim tempString As String, columnValues() As String
tempString = TidyString(GetTransactionInfo(listItems(currentItem).outerHTML), "&#\d+;")
columnValues = GetColumnValues(tempString, headers)
.Cells(currentItem + 2, 1).Resize(1, UBound(columnValues) + 1) = columnValues
Next currentItem
End With
Application.ScreenUpdating = True
End Sub
Public Function GetTransactionInfo(ByVal inputString) As String
'Split to get just the transaction items i.e. Headers and associated values
GetTransactionInfo = Split(Split(inputString, """transaction"",")(1), "}")(0)
End Function
Public Function TidyString(ByVal inputString As String, ByVal matchPattern As String) As String
'Extract transaction info
'Use regex to find these unwanted strings and replace pattern e.g. &#\d+;
'Example inputString
Dim regex As Object, tempString As String
Set regex = CreateObject("VBScript.RegExp")
With regex
.Global = True
.MultiLine = True
.IgnoreCase = False
.Pattern = matchPattern
End With
If regex.TEST(inputString) Then
TidyString = regex.Replace(inputString, vbNullString)
Else
TidyString = inputString
End If
End Function
Public Function GetColumnValues(ByVal inputString As String, ByVal headers As Variant) As Variant
' Example input string "product_id": "143513","product_name": "Canon 500D Nahlinse 72mm","product_price": "128.0","product_category": "26570","currency": "EUR","spr": "cfd","shop_name": "computeruniverse.net","delivery_time": "long","shop_rating": "100","position": "1","free_return": "14","approved_shipping": "false"
' Extract just the inner string value of each header e.g. 143513
Dim arr() As String, currentItem As Long, tempString As String
tempString = inputString
For currentItem = LBound(headers) To UBound(headers)
tempString = TidyString(tempString, Chr$(34) & headers(currentItem) & Chr$(34) & ":")
Next currentItem
arr = Split(Replace$(tempString, Chr$(34), vbNullString), ",")
GetColumnValues = arr
End Function
关于internet-explorer - VBA 未完全获取 Web 表,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/21907715/