Extract all the links of a website

I entered the cells highlighted in gray for you.
Would it be possible to proceed step by step?
Initially you might:
1) make sure that in sheet 1 the macro (contained in the file and which extracts all the links of the website for each line) does not process all the lines together but that the processing takes place one line at a time with saving and the possibility of blocking the macro. When I start the macro at a later time it must resume from the first unprocessed line.
2) insertion of the button (cell highlighted in gray) where the code moves the links of sheet 1 in the respective columns based on the column header. I would like to understand what happens if I found more links containing the word in the header (you will probably need to prefer the one that contains it at the end of the link).
As I indicated for the extraction of the link of the facebook page, if it is not useful to move the links from sheet 1 you could use the same procedure used by the code contained in the other "regex" file.
3) sheet 3 inserting the button to extrapolate the site name from each url (as you did in the last file you attached).
For the extraction of emails and telephone numbers you can do it after checking how it proceeds with the other functions.
My goal is to be able to extract certain contacts (emails and telephone numbers) present in specific web pages of the website (homepage, contact page, other pages such as marketing office, etc.). The composition of the link of a web page normally reflects the content of the web page, so the contact web page will have the word contacts etc. within it.
 
Upvote 0

Excel Facts

Can Excel fill bagel flavors?
You can teach Excel a new custom list. Type the list in cells, File, Options, Advanced, Edit Custom Lists, Import, OK
The code below is for the button at sheet 1. It makes the URL bold after processing it, and on the next execution will only work on non-bold entries.

VBA Code:
Private Sub CommandButton1_Click()
Const colUrl As Long = 1      'Must always be the first column
Dim url$, http As Object, htmlDoc As Object, nodeAllLinks As Object, onelink As Object, loadOK As Boolean, _
tbl_url_oal$, tbl_all As String, cr_tbl_urls As Long, lastRowTableUrls&, cr_table_all(1 To 3) As Long, _
lastRowTableAll&, cc As Long, cel As Range
tbl_url_oal = "foglio1"     'Name of Sheet
cr_tbl_urls = 2           'First row for content
tbl_all = "foglio1"
Sheets(tbl_url_oal).Activate
For cc = colUrl To 3
    cr_table_all(cc) = 2
Next
Set htmlDoc = CreateObject("htmlfile")
Set http = CreateObject("MSXML2.ServerXMLHTTP.6.0")
Do While Sheets(tbl_url_oal).Cells(cr_tbl_urls, 1) <> ""
    If Not Sheets(tbl_url_oal).Cells(cr_tbl_urls, 1).Font.Bold Then
        If ActiveSheet.Name = tbl_url_oal Then Sheets(tbl_url_oal).Cells(cr_tbl_urls, 1).Activate
        url = Sheets(tbl_url_oal).Cells(cr_tbl_urls, 1)
        On Error Resume Next
        http.Open "GET", url, False
        http.send
        If Err.Number = 0 Then loadOK = True
        On Error GoTo 0
        If loadOK Then
            htmlDoc.body.innerHTML = http.responseText
            Set nodeAllLinks = htmlDoc.getElementsByTagName("a")
            cr_table_all(1) = Sheets(tbl_all).Range("b" & Rows.Count).End(xlUp).Row + 1
            For Each onelink In nodeAllLinks
                DoEvents
                Sheets(tbl_all).Cells(cr_table_all(1), 2) = _
                Right(onelink.href, Len(onelink.href) - InStr(onelink.href, ":"))
                Sheets(tbl_all).Cells(cr_table_all(1), 3) = url
                cr_table_all(1) = cr_table_all(1) + 1
            Next
        End If
        loadOK = False
        lastRowTableAll = Sheets(tbl_all).Cells(Rows.Count, colUrl).End(xlUp).Row
        For cc = colUrl To 3
            cr_table_all(cc) = lastRowTableAll + 1
        Next
        Sheets(tbl_url_oal).Cells(cr_tbl_urls, 1).Font.Bold = True
    End If
    cr_tbl_urls = cr_tbl_urls + 1
Loop
Set http = Nothing: Set htmlDoc = Nothing
Set nodeAllLinks = Nothing
Set onelink = Nothing
End Sub
 
Upvote 0
The code below is for the button at sheet 1. It makes the URL bold after processing it, and on the next execution will only work on non-bold entries.

VBA Code:
Private Sub CommandButton1_Click()
Const colUrl As Long = 1      'Must always be the first column
Dim url$, http As Object, htmlDoc As Object, nodeAllLinks As Object, onelink As Object, loadOK As Boolean, _
tbl_url_oal$, tbl_all As String, cr_tbl_urls As Long, lastRowTableUrls&, cr_table_all(1 To 3) As Long, _
lastRowTableAll&, cc As Long, cel As Range
tbl_url_oal = "foglio1"     'Name of Sheet
cr_tbl_urls = 2           'First row for content
tbl_all = "foglio1"
Sheets(tbl_url_oal).Activate
For cc = colUrl To 3
    cr_table_all(cc) = 2
Next
Set htmlDoc = CreateObject("htmlfile")
Set http = CreateObject("MSXML2.ServerXMLHTTP.6.0")
Do While Sheets(tbl_url_oal).Cells(cr_tbl_urls, 1) <> ""
    If Not Sheets(tbl_url_oal).Cells(cr_tbl_urls, 1).Font.Bold Then
        If ActiveSheet.Name = tbl_url_oal Then Sheets(tbl_url_oal).Cells(cr_tbl_urls, 1).Activate
        url = Sheets(tbl_url_oal).Cells(cr_tbl_urls, 1)
        On Error Resume Next
        http.Open "GET", url, False
        http.send
        If Err.Number = 0 Then loadOK = True
        On Error GoTo 0
        If loadOK Then
            htmlDoc.body.innerHTML = http.responseText
            Set nodeAllLinks = htmlDoc.getElementsByTagName("a")
            cr_table_all(1) = Sheets(tbl_all).Range("b" & Rows.Count).End(xlUp).Row + 1
            For Each onelink In nodeAllLinks
                DoEvents
                Sheets(tbl_all).Cells(cr_table_all(1), 2) = _
                Right(onelink.href, Len(onelink.href) - InStr(onelink.href, ":"))
                Sheets(tbl_all).Cells(cr_table_all(1), 3) = url
                cr_table_all(1) = cr_table_all(1) + 1
            Next
        End If
        loadOK = False
        lastRowTableAll = Sheets(tbl_all).Cells(Rows.Count, colUrl).End(xlUp).Row
        For cc = colUrl To 3
            cr_table_all(cc) = lastRowTableAll + 1
        Next
        Sheets(tbl_url_oal).Cells(cr_tbl_urls, 1).Font.Bold = True
    End If
    cr_tbl_urls = cr_tbl_urls + 1
Loop
Set http = Nothing: Set htmlDoc = Nothing
Set nodeAllLinks = Nothing
Set onelink = Nothing
End Sub
Excuse me if I ask you again.
You could enter the file with the code inside.
Thank you
 
Upvote 0
I think you should try to create the button yourself, here is the procedure:

  • Ribbon > developer > Insert > ActiveX command button
  • Draw the button > right click > view code > paste the code
  • Ribbon > exit design mode > click button to run code.
 
Upvote 0
This is the code for the second button:

VBA Code:
Private Sub CommandButton1_Click()
Dim fog As Worksheet, c%, r As Range
Rem code for foglio 2
Set fog = Sheets("foglio1")
fog.[b1] = "header"
fog.[d1] = fog.[b1]
For c = 2 To Me.[2:2].Find("*", SearchOrder:=xlByColumns, SearchDirection:=2).Column
    fog.[d2] = "*" & Me.Cells(2, c) & "*"
    fog.[b:b].AdvancedFilter Action:=xlFilterCopy, CriteriaRange:=fog.Range("d1:d2"), _
    CopyToRange:=fog.[f1], Unique:=True
    Set r = [f1].CurrentRegion
    Set r = fog.Range(fog.Cells(2, 6), fog.Cells(r.Rows.Count, 6))
    If Len(fog.[f2]) Then r.Copy Me.Cells(3, c)
    r.Delete
Next
End Sub
 
Upvote 0
This is the code for the second button:

VBA Code:
Private Sub CommandButton1_Click()
Dim fog As Worksheet, c%, r As Range
Rem code for foglio 2
Set fog = Sheets("foglio1")
fog.[b1] = "header"
fog.[d1] = fog.[b1]
For c = 2 To Me.[2:2].Find("*", SearchOrder:=xlByColumns, SearchDirection:=2).Column
    fog.[d2] = "*" & Me.Cells(2, c) & "*"
    fog.[b:b].AdvancedFilter Action:=xlFilterCopy, CriteriaRange:=fog.Range("d1:d2"), _
    CopyToRange:=fog.[f1], Unique:=True
    Set r = [f1].CurrentRegion
    Set r = fog.Range(fog.Cells(2, 6), fog.Cells(r.Rows.Count, 6))
    If Len(fog.[f2]) Then r.Copy Me.Cells(3, c)
    r.Delete
Next
End Sub
thank you
 
Upvote 0
The third button:

VBA Code:
Rem foglio 3
Private Sub CommandButton1_Click()
Dim cel As Range
With New XMLHTTP60
    On Error Resume Next
    For Each cel In Me.Range("A4:A" & Me.Cells(Rows.Count, 1).End(xlUp).Row)
        .Open "GET", cel.Value, False
        .send
        If .Status = 200 Then cel.Offset(, 1) = _
        Split(Split(.responseText, "<title>")(1), "</")(0)
    Next
End With
On Error GoTo 0
End Sub
 
Upvote 0
The email code:

VBA Code:
Rem foglio 3
Private Sub CommandButton2_Click()
Dim htmldoc As Object, http, url$, loadok, col As Collection, i%, j%, laste%
laste = 4
Set htmldoc = CreateObject("htmlfile")
Set http = CreateObject("MSXML2.ServerXMLHTTP.6.0")
For i = 4 To Me.Range("a" & Me.Rows.Count).End(xlUp).Row
    url = Me.Cells(i, 1)
    On Error Resume Next
    http.Open "GET", url, False
    http.send
    If Err.Number = 0 Then loadok = True
    On Error GoTo 0
    If loadok Then
        htmldoc.body.innerHTML = http.responseText
        Set col = GMAd(htmldoc)
        For j = 1 To col.Count
            Me.Cells(laste, 5) = col(j)
            laste = laste + 1
        Next
    End If
Next
End Sub

Function GMAd(ByVal hdoc As Object) As Collection
Dim opcol As Collection, emm As Object, mfound As Object, regEx As Object
Set opcol = New Collection
Set regEx = CreateObject("VBScript.RegExp")
With regEx
    .Pattern = "([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|" & _
    "(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?)"
    .Global = True
    Set emm = .Execute(hdoc.body.innerHTML)
End With
For Each mfound In emm
    On Error Resume Next
    opcol.Add mfound.Value, Key:=mfound.Value
    On Error GoTo 0
Next
Set GMAd = opcol
End Function
 
Upvote 0

Forum statistics

Threads
1,223,911
Messages
6,175,325
Members
452,635
Latest member
laura12345

We've detected that you are using an adblocker.

We have a great community of people providing Excel help here, but the hosting costs are enormous. You can help keep this site running by allowing ads on MrExcel.com.
Allow Ads at MrExcel

Which adblocker are you using?

Disable AdBlock

Follow these easy steps to disable AdBlock

1)Click on the icon in the browser’s toolbar.
2)Click on the icon in the browser’s toolbar.
2)Click on the "Pause on this site" option.
Go back

Disable AdBlock Plus

Follow these easy steps to disable AdBlock Plus

1)Click on the icon in the browser’s toolbar.
2)Click on the toggle to disable it for "mrexcel.com".
Go back

Disable uBlock Origin

Follow these easy steps to disable uBlock Origin

1)Click on the icon in the browser’s toolbar.
2)Click on the "Power" button.
3)Click on the "Refresh" button.
Go back

Disable uBlock

Follow these easy steps to disable uBlock

1)Click on the icon in the browser’s toolbar.
2)Click on the "Power" button.
3)Click on the "Refresh" button.
Go back
Back
Top