I am scraping HTML tables with a GET request. The req.responseText is then passed to the module below to process the HTML tables.
My code works 100% but I'm trying to optimise it. It is a large daily crawl. I have 2 questions:
1. In the section where I'm processing the cell with className = "views-field views-field-field-date-published", I don't want the innerText of the TableCell; I want the value of the span's content attribute, i.e. content="2020-10-22T00:00:00+02:00". At the moment I'm processing the innerHTML with a custom function, which works fine. But is there a way to read the content="2020-10-22T00:00:00+02:00" attribute value directly, similar to what I'm doing for the href in the TableCell above?
2. I need to do different things with the content of each TableCell, therefore I've resorted to the multiple If statements based on TableCell.className. But it does require a lot of extra looping. Is there a more efficient way to get right to the applicable section?
Thanks in advance for any help.
My code works 100% but I'm trying to optimise it. It is a large daily crawl. I have 2 questions:
1. In the section where I'm processing the cell with className = "views-field views-field-field-date-published", I don't want the innerText of the TableCell; I want the value of the span's content attribute, i.e. content="2020-10-22T00:00:00+02:00". At the moment I'm processing the innerHTML with a custom function, which works fine. But is there a way to read the content="2020-10-22T00:00:00+02:00" attribute value directly, similar to what I'm doing for the href in the TableCell above?
2. I need to do different things with the content of each TableCell, therefore I've resorted to the multiple If statements based on TableCell.className. But it does require a lot of extra looping. Is there a more efficient way to get right to the applicable section?
Thanks in advance for any help.
VBA Code:
Option Explicit
Sub ProcessHTMLTables(HTMLString As String)
    ' Loads HTMLString into an in-memory MSHTML document, takes the first
    ' <table> it contains and visits every <td> of every <tr>, capturing a
    ' value per cell based on the cell's exact className.
    '
    ' Parameters:
    '   HTMLString - markup assigned to the document body's innerHTML.
    '
    ' Behaviour:
    '   * Returns immediately when the markup contains no <table>.
    '   * A title cell without an <a> element yields an empty URL instead of
    '     raising run-time error 91.
    '   * The captured values live in local variables only; nothing in this
    '     procedure, as shown, consumes them after the cell loop.
    Dim HTMLDoc As New MSHTML.HTMLDocument
    Dim HTMLTable As MSHTML.IHTMLElement
    Dim TableRow As MSHTML.IHTMLElement
    Dim TableCell As MSHTML.IHTMLElement
    Dim CellLinks As MSHTML.IHTMLElementCollection
    Dim Category As String, URL As String, Description As String
    Dim PDate As String, ClDate As String, BDate As String

    HTMLDoc.body.innerHTML = HTMLString

    ' Indexing the collection with (0) yields Nothing when there is no table.
    Set HTMLTable = HTMLDoc.getElementsByTagName("table")(0)
    If HTMLTable Is Nothing Then Exit Sub

    For Each TableRow In HTMLTable.getElementsByTagName("tr")
        For Each TableCell In TableRow.getElementsByTagName("td")

            ' One evaluation of className per cell; each Case is an exact,
            ' whole-string match, same as the "=" comparisons it replaces.
            Select Case TableCell.className

                Case "views-field views-field-field-category"
                    Category = TableCell.innerText

                Case "views-field views-field-title"
                    ' Guard the link lookup: a cell with no <a> would otherwise
                    ' fail on .href of a Nothing reference.
                    Set CellLinks = TableCell.getElementsByTagName("a")
                    If CellLinks.Length > 0 Then
                        ' Mid(..., 7) drops the first six characters of href
                        ' (presumably the "about:" prefix MSHTML puts on
                        ' relative links in this document - TODO confirm).
                        URL = "https://****.com" & Mid(CellLinks(0).href, 7)
                    Else
                        URL = vbNullString
                    End If
                    Description = TableCell.innerText

                ' NOTE(review): the sample markup below carries the ISO timestamp
                ' in the span's content attribute. Reading it directly with
                ' TableCell.getElementsByTagName("span")(0).getAttribute("content")
                ' would presumably avoid string-parsing the innerHTML; confirm
                ' what ExtractDateValue returns before switching.

                Case "views-field views-field-field-date-published"
                    Debug.Print TableCell.innerHTML 'e.g. <span class="date-display-single" property="dc:date" datatype="xsd:dateTime" content="2020-10-22T00:00:00+02:00">22/10/2020</span>
                    Debug.Print TableCell.innerText 'e.g. 22/10/2020
                    PDate = ExtractDateValue(TableCell.innerHTML)

                Case "views-field views-field-field-date-closing"
                    Debug.Print TableCell.innerHTML 'e.g. <span class="date-display-single" property="dc:date" datatype="xsd:dateTime" content="2020-10-23T00:00:00+02:00">23/10/2020</span>
                    Debug.Print TableCell.innerText 'e.g. 23/10/2020
                    ClDate = ExtractDateValue(TableCell.innerHTML)

                Case "views-field views-field-field-date-briefing"
                    Debug.Print TableCell.innerHTML 'e.g. <span class="date-display-single" property="dc:date" datatype="xsd:dateTime" content="2020-10-24T00:00:00+02:00">24/10/2020</span>
                    Debug.Print TableCell.innerText 'e.g. 24/10/2020
                    BDate = ExtractDateValue(TableCell.innerHTML)

            End Select

        Next TableCell
    Next TableRow
End Sub