javascript - 将 HTML 文档的 javascript 部分中的字段提取到表中?地理坐标

标签 javascript json excel web-scraping vba

我有一个 HTML 文档,其中包含 JavaScript block 中的地理信息。这是这个网页的源代码:https://energy.ehawaii.gov/epd/public/energy-projects-map.html

这可以作为 map 也可以作为列表查看。

我想要实现的是在 Excel 中包含该列表,但有一个“纬度”字段和一个“经度”字段。 Google map 标记指定 LatLng在 JavaScript 中。

如何使用VB之类的东西来处理HTML文件的源代码,并组织成具有以下字段/列的表:

  • 说明(来自 <a ... title="such and such">)
  • 技术(例如来自 <p>Technology: Solar</p>)
  • 纬度(来自 google.maps.LatLng(latitude, longitude);
  • 经度(来自与纬度相同的代码行,但使用第二个变量)?

感谢所有帮助!

最佳答案

尝试这个基于 XMLHTTP 请求的 VBScript 解决方案。只需复制下面的代码,粘贴到文本文件,将其另存为 .vbs 并运行它。脚本尚未优化,所有请求都不是异步的,因此在我的电脑上大约需要 40 秒才能获取所有数据。

Option Explicit
Dim arrCells(), arrList, arrTmp, sRespHeaders, sRespText, arrSetHeaders, i, j, iTotal, oApp, oWB, oWS, oOutput

' Create output window
Output oOutput

' Get cookies
oOutput.write "Get cookies"
XmlHttpGet "https://energy.ehawaii.gov/epd/public/energy-projects-map.html", Array(), sRespHeaders, sRespText
ParseResponse "^Set-(Cookie): ([\S]*?=[\S]*?);[\s\S]*?$", sRespHeaders, arrSetHeaders

' Get project list
oOutput.write "Get project list"
arrList = Array()
XmlHttpGet "https://energy.ehawaii.gov/epd/public/energy-projects-list.json?sEcho=2&iColumns=5&sColumns=&iDisplayStart=1&iDisplayLength=0&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&mDataProp_4=4&sSearch=&bRegex=false&sSearch_0=&bRegex_0=false&bSearchable_0=true&sSearch_1=&bRegex_1=false&bSearchable_1=true&sSearch_2=&bRegex_2=false&bSearchable_2=true&sSearch_3=&bRegex_3=false&bSearchable_3=true&sSearch_4=&bRegex_4=false&bSearchable_4=true&iSortCol_0=0&sSortDir_0=asc&iSortingCols=1&bSortable_0=true&bSortable_1=true&bSortable_2=true&bSortable_3=true&bSortable_4=true", arrSetHeaders, "", sRespText
ParseProjects sRespText, arrList, iTotal
oOutput.write "Get project list: " & (UBound(arrList) + 1) & " of " & iTotal

' Rearrange to 2-dimensional array, get LatLng
ReDim arrCells(UBound(arrList), 8) ' Name, Technology, Island, Capacity, Location, RID, Type, Lat, Lng
For i = 0 To UBound(arrList)
    For j = 0 To 6
        arrCells(i, j) = arrList(i)(j)
    Next
    oOutput.write "Get LatLng: " & (i + 1) & " of " & iTotal
    arrTmp = RequestLatLng(arrList(i)(5))
    arrCells(i, 7) = arrTmp(0)
    arrCells(i, 8) = arrTmp(1)
Next

' Create Excel worksheet, output data
oOutput.write "Export to Excel"
Set oApp = CreateObject("Excel.Application")
oApp.Visible = True
Set oWB = oApp.Workbooks.Add(-4167) ' xlWBATWorksheet
Set oWS = oWB.Worksheets(1)
oWS.Range(oWS.Cells(1, 1), oWS.Cells(UBound(arrCells) + 1, 9)).Value = arrCells
oWS.Columns.AutoFit
oWB.Saved = True
oOutput.write "Completed"

Sub XmlHttpGet(sQuery, arrSetHeaders, sRespHeaders, sRespText)
    Dim arrHeader
    With CreateObject("MSXML2.ServerXMLHTTP")
        .SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
        .Open "GET", sQuery, False
        For Each arrHeader In arrSetHeaders
            .SetRequestHeader arrHeader(0), arrHeader(1)
        Next
        .Send ""
        sRespHeaders = .GetAllResponseHeaders
        sRespText = .ResponseText
    End With
End Sub

Sub ParseResponse(sPattern, sResponse, aData)
    Dim oMatch, aTmp, sSubMatch
    aData = Array()
    With CreateObject("VBScript.RegExp")
        .Global = True
        .MultiLine = True
        .Pattern = sPattern
        For Each oMatch In .Execute(sResponse)
            If oMatch.SubMatches.Count = 1 Then
                PushItem aData, oMatch.SubMatches(0)
            Else
                aTmp = Array()
                For Each sSubMatch In oMatch.SubMatches
                    PushItem aTmp, sSubMatch
                Next
                PushItem aData, aTmp
            End If
        Next
    End With
End Sub

Sub PushItem(aList, vItem)
    ReDim Preserve aList(UBound(aList) + 1)
    aList(UBound(aList)) = vItem
End Sub

Sub ParseProjects(sJson, arrProj, iTotalRecords)
    Dim i, q
    With CreateObject("htmlfile")
        With .parentwindow
            .execscript ";", "jscript"
            .eval ("json = " & sJson & ";")
            iTotalRecords = CInt(.json.iTotalRecords)
            Do While .json.aaData.Length
                ReDim Preserve arrProj(UBound(arrProj) + 1)
                With .json.aaData.Shift()
                    arrProj(UBound(arrProj)) = Array(.Shift(), .Shift(), .Shift(), .Shift(), .Shift(), .Shift(), .Shift())
                End With
            Loop
        End With
    End With
End Sub

Function RequestLatLng(sRid)
    Dim sRespText, arrTmp, sTmp
    XmlHttpGet "https://energy.ehawaii.gov/epd/public/energy-project-details.html?rid=" & sRid, Array(), "", sRespText
    arrTmp = Split(sRespText, "google.maps.LatLng(")
    If UBound(arrTmp) >= 1 Then
        sTmp = arrTmp(1)
        arrTmp = Split(sTmp, "),")
        If UBound(arrTmp) >= 1 Then
            RequestLatLng = Split(arrTmp(0), ", ")
            Exit Function
        End If
    End If
    RequestLatLng = Array("#", "#")
End Function

Sub Output(oWnd)
    Set oWnd = ShowWindow("energy.ehawaii.gov", "", 354, 118)
End Sub

Function ShowWindow(sTitle, sBG, iWidth, iHeight)
    Set ShowWindow = CreateWindow()
    With ShowWindow
        With .document
            .title = sTitle
            .getElementsByTagName("head")(0).appendChild .createElement("style")
            .styleSheets(0).cssText = "* {font: 8pt tahoma; margin: 5px;}"
            .body.style.background = "buttonface"
            .body.style.backgroundRepeat = "no-repeat"
            .body.style.backgroundImage = "url(" & sBG & ")"
            .body.innerHTML = ""
        End With
        .resizeTo .screen.availWidth, .screen.availHeight
        .resizeTo iWidth + .screen.availWidth - .document.body.offsetWidth, iHeight + .screen.availHeight - .document.body.offsetHeight
        .moveTo CInt((.screen.availWidth - iWidth) / 2), CInt((.screen.availHeight - iHeight) / 2)
        .execScript "var handlers, thunks = {body_onunload: function() {handlers.WSHQuit()}};"
        Execute "Class clsHandlers: Public Sub WSHQuit(): WScript.Quit: End Sub: End Class"
        Set .handlers = New clsHandlers
        Set .document.body.onunload = .thunks.body_onunload
        .execScript "var write = function(t) {document.body.innerHTML = t};"
    End With
End Function

Function CreateWindow()
    ' source http://forum.script-coding.com/viewtopic.php?pid=75356#p75356
    Dim sSignature, oShellWnd, oProc
    On Error Resume Next
    sSignature = Left(CreateObject("Scriptlet.TypeLib").Guid, 38)
    Do
        Set oProc = CreateObject("WScript.Shell").Exec("mshta ""about:<head><script>moveTo(-32000,-32000);document.title=' '</script><hta:application id=app border=thick minimizebutton=no maximizebutton=no scroll=no showintaskbar=yes contextmenu=no selection=no innerborder=no icon=""""/><object id='shell' classid='clsid:8856F961-340A-11D0-A96B-00C04FD705A2'><param name=RegisterAsBrowser value=1></object><script>shell.putproperty('" & sSignature & "',document.parentWindow);</script></head>""")
        Do
            If oProc.Status > 0 Then Exit Do
            For Each oShellWnd In CreateObject("Shell.Application").Windows
                Set CreateWindow = oShellWnd.GetProperty(sSignature)
                If Err.Number = 0 Then Exit Function
                Err.Clear
            Next
        Loop
    Loop
End Function

关于javascript - 将 HTML 文档的 javascript 部分中的字段提取到表中?地理坐标,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/28329807/

相关文章:

jquery - ASP.NET MVC2 使用 Controller 读取 JSON 数据

vba - 如何在vba中保存用户表单中的值?

excel - 查找 ListObject 中的最后一个可见行

javascript - 为什么将 str 数字添加到自身后会将其变成数字?

javascript - 检测文本区域内的标记符号

javascript - 滑动 JS - 一次显示 3 张幻灯片

.net - 使用OLEDB读取文件名中带有额外点的Excel csv文件

javascript - 将 javascript 嵌套函数迁移到 typescript 出现错误

json.Marshal 返回奇怪的结果

javascript - 检查是否存在具有值的组合选择 web 服务 prestashop