Add Email RegXP to Email Extraction code

Sharid

Well-known Member
Joined
Apr 22, 2007
Messages
1,066
Office Version
  1. 2016
Platform
  1. Windows
I am trying to link the below function with the rest of my code so it uses Regxp to extract emails, however I am struggling. Currently it extracts everything into column B as shown in the image. It should however be extracting emails only.

I am ONLY stuck on the email part below the rest of the code is fine. I am Not sure how to link "colEmails" to the Email part of the code. I have made sevel attempts nothing has worked I have left some of my attempts in the code they are commented out

The Following two line do extract emails, however they are not very affective. Either one works. I ahve commeted them out for now
VBA Code:
     'If InStr(1, nodeOneLink.href, "@") Then
     'If InStr(1, nodeOneLink.href, "mailto:") Then

Use Link to download demo Download File

Capture.JPG


VBA Code:
'Option Explicit
Private Sub CommandButton1_Click()
 'Columns for both tables
    Const colUrl As Long = 1      'Must always be the first column
    Const colmail As Long = 2     'Must always be the first column before Some platforms
    Const colFacebook As Long = 3 'Must always be the last column of Some platforms
    Const colError As Long = 4    'Must always be the last column

    Dim url As String
    Dim http As Object
    Dim htmlDoc As Object
    Dim nodeAllLinks As Object
    Dim nodeOneLink As Object
    Dim pageLoadSuccessful As Boolean
    Dim tableUrlsOneAddressLeft As String
    Dim tableAllAddresses As String
    Dim currentRowTableUrls As Long
    Dim lastRowTableUrls As Long
    Dim currentRowsTableAll(colUrl To colFacebook) As Long
    Dim lastRowTableAll As Long
    Dim addressCounters(colmail To colFacebook) As Long
    Dim checkCounters As Long

    'Initialize variables
    tableUrlsOneAddressLeft = "Sheet2"  'Name of Sheet
    currentRowTableUrls = 2           'First row for content
    tableAllAddresses = "Sheet1"     'Name of Sheet
    
    For checkCounters = colUrl To colFacebook
        currentRowsTableAll(checkCounters) = 2   'First rows for content
    Next checkCounters
    
    Set htmlDoc = CreateObject("htmlfile")
    Set http = CreateObject("MSXML2.ServerXMLHTTP.6.0")
  
    'Clear all contents and comments in the URL source sheet from email column to error column
    With Sheets(tableUrlsOneAddressLeft)
        lastRowTableUrls = .Cells(Rows.Count, colUrl).End(xlUp).Row
        .Range(.Cells(currentRowTableUrls, colmail), .Cells(lastRowTableUrls, colError)).ClearContents
        .Range(.Cells(currentRowTableUrls, colmail), .Cells(lastRowTableUrls, colError)).ClearComments
    End With
  
    'Delete all rows except headline in the sheet with all addresses
    lastRowTableAll = Sheets(tableAllAddresses).Cells(Rows.Count, colUrl).End(xlUp).Row
    Sheets(tableAllAddresses).Rows(currentRowsTableAll(colUrl) & ":" & lastRowTableAll).Delete Shift:=xlUp
  
    'Loop over all URLs in column A in the URL source sheet
    Do While Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, 1).Value <> ""
        'Scroll for visual monitoring, if 'the sheet with the URLs are the
        'active one
        If ActiveSheet.Name = tableUrlsOneAddressLeft Then
            If currentRowTableUrls > 14 Then
                ActiveWindow.SmallScroll down:=1
            End If
            Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, 1).Select
        End If
    
        'Get next url from the URL source sheet
        url = Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colUrl).Value
    
        'Try to load page 'Temporarily disable error handling if 'there is a timeout or onother error
        On Error Resume Next
        http.Open "GET", url, False
        http.Send
    
        'Check if page loading was successful
        If Err.Number = 0 Then
            pageLoadSuccessful = True
        End If
        On Error GoTo 0
    
        If pageLoadSuccessful Then
            'Build html document for DOM operations
            htmlDoc.body.innerHTML = http.responseText
            'Create node list from all links of the page
            Set nodeAllLinks = htmlDoc.getElementsByTagName("a")
            'Walk through all links of the node list
      
    For Each nodeOneLink In nodeAllLinks
               
'''#####################################################################################################
'''################################### THIS IS THE START OF THE EMAIL SECTION ##########################
'''#####################################################################################################
DoEvents
'Dim colEmails As Collection
Set colEmails = GetEmailAddressesFromHtml(htmlDoc)
'If InStr(1, colmails) Then
'If colEmails.Length > 0 Then
'If colEmails Length > 0 Then
'If InStr(1, nodeOneLink.href, "@") Then
'If InStr(1, emailMatches) Then

'Check for mail address
    'If InStr(1, nodeOneLink.href, "@") Then
    'If InStr(1, nodeOneLink.href, "mailto:") Then
            'Write mail address to both tables
            Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colmail).Value = Right(nodeOneLink.href, Len(nodeOneLink.href) - InStr(nodeOneLink.href, ":"))
            Sheets(tableAllAddresses).Cells(currentRowsTableAll(colmail), colmail).Value = Right(nodeOneLink.href, Len(nodeOneLink.href) - InStr(nodeOneLink.href, ":"))
            'Check if it is a new line in the sheet with all addresses
    If currentRowsTableAll(colmail) >= currentRowsTableAll(colUrl) Then
            'Write URL in the new line of the sheet with all addresses
            Sheets(tableAllAddresses).Cells(currentRowsTableAll(colUrl), colUrl).Value = url
            'Increment url counter
            currentRowsTableAll(colUrl) = currentRowsTableAll(colUrl) + 1
    End If
            'Increment mail counters
             currentRowsTableAll(colmail) = currentRowsTableAll(colmail) + 1
            addressCounters(colmail) = addressCounters(colmail) + 1
    'End If
  
'''#####################################################################################################
'''################################### END OF THE EMAIL SECTION ########################################
'''#####################################################################################################

        'Check for Facebook address
        If InStr(1, UCase(nodeOneLink.href), "FACEBOOK") Then
            'Write Facebook address to both tables
            Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colFacebook).Value = nodeOneLink.href
            Sheets(tableAllAddresses).Cells(currentRowsTableAll(colFacebook), colFacebook).Value = nodeOneLink.href
            'Check if it is a new line in the sheet with all addresses
        If currentRowsTableAll(colFacebook) >= currentRowsTableAll(colUrl) Then
            'Write URL in the new line of the sheet with all addresses
             Sheets(tableAllAddresses).Cells(currentRowsTableAll(colUrl), colUrl).Value = url
            'Increment url counter
            currentRowsTableAll(colUrl) = currentRowsTableAll(colUrl) + 1
        End If
            'Increment Facebook counters
            currentRowsTableAll(colFacebook) = currentRowsTableAll(colFacebook) + 1
            addressCounters(colFacebook) = addressCounters(colFacebook) + 1
        End If

Next nodeOneLink

        'Check address counters
        For checkCounters = colmail To colFacebook
        'Set comment if more than 1 link were found
        If addressCounters(checkCounters) > 1 Then
            Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, checkCounters).AddComment Text:=CStr(addressCounters(checkCounters))
            Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, checkCounters).Comment.Shape.TextFrame.AutoSize = True
        End If
Next checkCounters
        Else
        'Page not loaded
        'Write message URL table
            Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colError).Value = "Error with URL or timeout"
        End If
    
        'Prepare for next page
        pageLoadSuccessful = False
        Erase addressCounters
        lastRowTableAll = Sheets(tableAllAddresses).Cells(Rows.Count, colUrl).End(xlUp).Row
        For checkCounters = colUrl To colFacebook
            currentRowsTableAll(checkCounters) = lastRowTableAll + 1 'First rows for next page content
        Next checkCounters
        currentRowTableUrls = currentRowTableUrls + 1
    Loop
  
 'Clean Up
     Set http = Nothing
    Set htmlDoc = Nothing
    Set nodeAllLinks = Nothing
    Set nodeOneLink = Nothing

End Sub

Function

VBA Code:
Private Function GetEmailAddressesFromHtml(ByVal htmlDocument As Object) As Collection
    ' Should return a collection of strings representing email addresses detected
    ' in the HTML document.
   
    Dim outputCollection As Collection
    Set outputCollection = New Collection
   
    Dim regEx As Object
    Set regEx = CreateObject("VBScript.RegExp")
   
    With regEx
       .Pattern = "([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?)"
       ' .Pattern = "[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"
        .Global = True
        Dim emailMatches As Object
        Set emailMatches = .Execute(htmlDocument.body.innerHTML)
    End With
   
    Dim matchFound As Object
    For Each matchFound In emailMatches
        On Error Resume Next ' De-duplicate here.
        outputCollection.Add matchFound.Value, Key:=matchFound.Value
        On Error GoTo 0
    Next matchFound
   
    Set GetEmailAddressesFromHtml = outputCollection
End Function
 

Excel Facts

What does custom number format of ;;; mean?
Three semi-colons will hide the value in the cell. Although most people use white font instead.
This is the correct download link Here as it has urls in sheet2. Above download link is without urls
 
Upvote 0
I am really stuck on this, if anyone can help that would be super
 
Upvote 0
I have put together a simpler version of my code, this time I have got the regxp to work but can not extract the social media part to extract. Can anyone advise, Thanks for having a look
1610555365586.png

VBA Code:
Private Sub CommandButton1_Click()
'Add "Start" To Sheet
Sheet6.Range("H1").Value = "Start"

'If "Start" run this
If Sheet6.Range("H1").Value = "Start" Then

Dim wb As Workbook
Dim wsSheet As Worksheet, links As Variant, link As Variant
Dim rw As Long
Dim regxp As New RegExp, post As Object, email_list As Object
Dim StartRow As Long
Dim EndRow As Long
Dim varLinks As Variant
Dim Counter As Long
Dim Html As New HTMLDocument
On Error GoTo ErrHandler


''''SHEET1 as sheet with URL
    Set wb = ThisWorkbook
    Set wsSheet = wb.Sheets("Collection")
   

'''' start and end rows
    StartRow = wsSheet.Cells(wsSheet.Rows.Count, "B").End(xlUp).row + 1
    EndRow = wsSheet.Cells(wsSheet.Rows.Count, "A").End(xlUp).row
    varLinks = WorksheetFunction.Transpose(wsSheet.Range("A" & StartRow & ":A" & EndRow))
    
    Counter = 0
    For Each link In varLinks
DoEvents
    
      Set Html = NewHTMLDocument(CStr(link))
With regxp
''' ########## Email Pattern ###########
        .Pattern = "([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?)"
        .Global = False
        .IgnoreCase = True
        Set email_list = .Execute(Html.body.innerHTML)
 DoEvents

 End With

DoEvents
        Set Html = NewHTMLDocument(CStr(link))
        On Error Resume Next
        If email_list(0) Is Nothing Then
            wsSheet.Cells(StartRow + Counter, 2).Value = "-"
        Else
         On Error Resume Next
            wsSheet.Cells(StartRow + Counter, 2).Value = email_list(0)
        End If
       
'########################### NOT WORKING #############################
'Social Links
        Set links = Html.getElementsByTagName("a")
        If InStr(UCase(link.outerHTML), "FACEBOOK") Is Nothing Then
            wsSheet.Cells(StartRow + Counter, 3).Value = "-"
        Else
             wsSheet.Cells(StartRow + Counter, 3).Value = link.href
            'wsSheet.Cells(StartRow + Counter, 3).Value = link.outerHTML
        End If
'########################### NOT WORKING #############################
        Counter = Counter + 1
 
'Stop the code
    If Sheet6.Range("H1").Value = "Stopped" Then
        MsgBox "You Have Stopped The Process"
        Exit Sub
    End If
      
Next link


Exit Sub
ErrHandler:
errtext = "Err#: " & Err.Number & vbNewLine & "Desc: " & Err.Description
Debug.Print errtext
MsgBox errtext

End If
End Sub

Function

VBA Code:
Public Function NewHTMLDocument(strURL As String) As Object
    Dim objHTTP As Object, objHTML As Object, strTemp As String
    Set objHTTP = CreateObject("MSXML2.ServerXMLHTTP")
    objHTTP.Open "GET", strURL, False
    objHTTP.send
    If objHTTP.Status = 200 Then
        strTemp = objHTTP.responseText
    Set objHTML = CreateObject("htmlfile")
    objHTML.body.innerHTML = strTemp
    Set NewHTMLDocument = objHTML
Else
End If
End Function

VBA Code:
Private Sub CommandButton2_Click()
'Add "Stopped" to sheet to exit code
Sheet6.Range("H1").Value = "Stopped"
End Sub
 
Upvote 0
Hi Sharid
Do you still need help with this code/question? I'm concious that it's been 26+ days since you've last requested assistance, but I've just had a look at the workbook you've posted and just wanted to see if you've since progressed from here in the interim period. If not, I have some comments I can make on the code that might help.
 
Upvote 0
Thanks Dan,

I managed to work it out in the end, I can now use the Regxp and Mailto , so its no longer an issue
 
Upvote 0
Great. Seems like your project is coming together.
 
Upvote 0

Forum statistics

Threads
1,223,898
Messages
6,175,274
Members
452,628
Latest member
dd2

We've detected that you are using an adblocker.

We have a great community of people providing Excel help here, but the hosting costs are enormous. You can help keep this site running by allowing ads on MrExcel.com.
Allow Ads at MrExcel

Which adblocker are you using?

Disable AdBlock

Follow these easy steps to disable AdBlock

1)Click on the icon in the browser’s toolbar.
2)Click on the icon in the browser’s toolbar.
2)Click on the "Pause on this site" option.
Go back

Disable AdBlock Plus

Follow these easy steps to disable AdBlock Plus

1)Click on the icon in the browser’s toolbar.
2)Click on the toggle to disable it for "mrexcel.com".
Go back

Disable uBlock Origin

Follow these easy steps to disable uBlock Origin

1)Click on the icon in the browser’s toolbar.
2)Click on the "Power" button.
3)Click on the "Refresh" button.
Go back

Disable uBlock

Follow these easy steps to disable uBlock

1)Click on the icon in the browser’s toolbar.
2)Click on the "Power" button.
3)Click on the "Refresh" button.
Go back
Back
Top