Extract all the links of a website

Excel Facts

What do {} around a formula in the formula bar mean?
{Formula} means the formula was entered using Ctrl+Shift+Enter signifying an old-style array formula.
This code seems to work, listing the links up to the chosen level. However, it is inefficient because it generates multiple duplicated results. I will improve it, probably creating a collection of unique items.

Code:
Dim r%, s$
Sub Request2(URL$, Optional Level% = 1)
Dim n&, ob, oreq As New XMLHTTP60, odoc As New HTMLDocument
oreq.Open "GET", URL, False
oreq.Send
odoc.Body.innerHTML = oreq.responseText
r = r + 1
Cells(r, 1) = Level
Set ob = odoc.getElementsByTagName("a")
For n = 0 To ob.Length - 1
    r = r + 1
    Cells(r, 1) = ob.Item(n).href
Next
If Level Then
    For n = 0 To ob.Length - 1
        If ob.Length > 0 Then
            If ob.Item(n).href Like s Then Request2 ob.Item(n).href, Level - 1
        End If
    Next
End If
End Sub


Sub Main2()                             ' run me
r = 0
s = "*salento*"                         ' to avoid external references
On Error Resume Next
Request2 "https://www.unisalento.it", 5
MsgBox "done"
End Sub
 
Upvote 0
I do not understand, however, that it would be enough to extract all the links, as it currently does and then transfer to the table with the headings only those links (depending on the choice 2,3,4 level etc.) whose level is determined by the number of times of character "/" present in the link.
The problem perhaps is to make sure that the links end up in column "b" when all the cells of the column "a" then fill in the "c" etc.
 
Upvote 0
This is a better version; it loads 331 pages, finding 5101 unique addresses for the chosen example.
I will be back later to finish the project.

Code:
Dim r&, s$, dict As Object


Sub Request22(URL$, Optional Level% = 1)
Dim n&, ob, oreq As New XMLHTTP60, odoc As New HTMLDocument
oreq.Open "GET", URL, False
On Error Resume Next
oreq.Send
On Error GoTo 0
odoc.Body.innerHTML = oreq.responseText
r = r + 1
Cells(r, 1) = Level
Set ob = odoc.getElementsByTagName("a")
For n = 0 To ob.Length - 1
    r = r + 1
    Cells(r, 1) = ob.Item(n).href
Next
If Level Then
    For n = 0 To ob.Length - 1
        If ob.Length > 0 Then
            If ob.Item(n).href Like s And Not dict.Exists(ob.Item(n).href) Then
                Request22 ob.Item(n).href, Level - 1
                dict.Add ob.Item(n).href, ob.Item(n).href
            End If
        End If
    Next
End If
End Sub


Sub Main22()                             ' run me
Set dict = CreateObject("Scripting.Dictionary")
r = 0
s = "*salento*"                         ' to avoid external references
Request22 "https://www.unisalento.it"
Cells(1, 2) = Cells(1, 1)
Cells(2, 2) = "*"
[a:a].AdvancedFilter xlFilterCopy, Range("b1:b2"), Cells(1, 3), True
MsgBox dict.Count & " recursive calls.", , "done"
End Sub
 
Upvote 0
The following example shows how to filter the data, considering the level limit at cell E3 and the desired word at cell E4.
The raw data should be on column C, and the filtered result appears on column H.

Code:
Sub AdvFilt2()
[e1] = ""
[e2].Formula = "=AND(LEN(C2)-LEN(SUBSTITUTE(C2,""/"",""""))<$E$3,NOT(ISERR(FIND($E$4,C2))))"
[c:c].AdvancedFilter xlFilterCopy, [e1:e2], [h1], True
End Sub
 
Upvote 0
I will prepare a consolidated version that extracts all links and filters this list multiple times based on supplied criteria.
 
Upvote 0
I will prepare a consolidated version that extracts all links and filters this list multiple times based on supplied criteria.

I'd like to show you a macro that can be combined with this macro. Just to make you participate.
I'm gonna write you my email here? It seems you can't get private messages.
 
Upvote 0
Features of this version:


  • Column A = all site links
  • Column B = filter criteria
  • Column C = unique links
  • Column D = first result column
  • Column E = second result column

Place the level limit at the third row (D3, E3…) and the desired word at the fourth row (D4, E4…)

Code:
Dim r&, s$, dict As Object

Sub Main22()                             ' run me
Set dict = CreateObject("Scripting.Dictionary")
r = 0
s = "*salento*"                         ' to avoid external references
Request22 "https://www.unisalento.it"
Cells(1, 2) = Cells(1, 1)
Cells(2, 2) = "*"
[a:a].AdvancedFilter xlFilterCopy, Range("b1:b2"), Cells(1, 3), True
AdvFilt2
MsgBox dict.Count & " recursive calls.", , "done"
End Sub

Sub Request22(URL$, Optional Level% = 1)
Dim n&, ob, oreq As New XMLHTTP60, odoc As New HTMLDocument
oreq.Open "GET", URL, False
On Error Resume Next
oreq.Send
On Error GoTo 0
odoc.Body.innerHTML = oreq.responseText
r = r + 1
Cells(r, 1) = Level
Set ob = odoc.getElementsByTagName("a")
For n = 0 To ob.Length - 1
    r = r + 1
    Cells(r, 1) = ob.Item(n).href
Next
If Level Then
    For n = 0 To ob.Length - 1
        If ob.Length > 0 Then
            If ob.Item(n).href Like s And Not dict.Exists(ob.Item(n).href) Then
                Request22 ob.Item(n).href, Level - 1
                dict.Add ob.Item(n).href, ob.Item(n).href
            End If
        End If
    Next
End If
End Sub

Sub AdvFilt2()
Dim r As Range, a, c%
Set r = [d2]
For c = 4 To Cells(3, Columns.Count).End(xlToLeft).Column
    a = Split(r.Address, "$")
    r.Offset(-1) = ""
    r.Formula = "=AND(LEN(c2)-LEN(SUBSTITUTE(c2,""/"",""""))<$" & a(1) & _
    "$3,NOT(ISERR(FIND($" & a(1) & "$4,c2))))"
    [c:c].AdvancedFilter xlFilterCopy, Range(r.Offset(-1), r), r.Offset(4), True
    Set r = r.Offset(, 1)
Next
End Sub
 
Upvote 0

Forum statistics

Threads
1,225,814
Messages
6,187,184
Members
453,411
Latest member
healthcares

We've detected that you are using an adblocker.

We have a great community of people providing Excel help here, but the hosting costs are enormous. You can help keep this site running by allowing ads on MrExcel.com.
Allow Ads at MrExcel

Which adblocker are you using?

Disable AdBlock

Follow these easy steps to disable AdBlock

1)Click on the icon in the browser’s toolbar.
2)Click on the icon in the browser’s toolbar.
2)Click on the "Pause on this site" option.
Go back

Disable AdBlock Plus

Follow these easy steps to disable AdBlock Plus

1)Click on the icon in the browser’s toolbar.
2)Click on the toggle to disable it for "mrexcel.com".
Go back

Disable uBlock Origin

Follow these easy steps to disable uBlock Origin

1)Click on the icon in the browser’s toolbar.
2)Click on the "Power" button.
3)Click on the "Refresh" button.
Go back

Disable uBlock

Follow these easy steps to disable uBlock

1)Click on the icon in the browser’s toolbar.
2)Click on the "Power" button.
3)Click on the "Refresh" button.
Go back
Back
Top