Akuini

VBA Macro to create Word & Phrase Frequency

Sorry, for the late reply:
  • I assumed you only need 1 word frequency
  • Data start at row 2, you can change it in this part: For i = 2 To UBound(va, 1) 'data start at row 2
  • If you have stop words list then put the list starting at A1 in sheet2. The stop words will be removed from the list.
  • You need to manually sort data by col A (Divison)
  • Run Sub Word_Phrase_Frequency_v1, you will be asked to put the cursor at the proper column, so if you want to get frequency of "Question 1 Answers" (col B) then put the cursor in col B.
  • The result will be at 2 column on the right of the last column with data

VBA Code:
Option Explicit
Dim LC As Long 'last column
Dim CX As Long 'active cell column
Dim rSW As Range


Sub Word_Phrase_Frequency_v1()

'The code will generate word/phrase frequency
'How to use:
'1. Add reference to "Microsoft VBScript Regular Expressions 5.5" (you need to do it once only):
'   In Visual Basic Editor menu, select Tools –> References, then select Microsoft VBScript Regular Expressions 5.5, then click OK.
'2. Data must be in column A, start at A1
'3. Run Word_Phrase_Frequency_v1


'--- CHANGE sNumber & xPattern VALUE TO SUIT -----------------------------------

Const sNumber As String = "1,2,3"  '"1,2,3"
'sNumber = "1"  will generate 1 word frequency list
'sNumber = "1,2,3"  will generate 1 word, 2 word & 3 word frequency list

Const xPattern As String = "A-Z0-9_'"
'define the word characters, the above pattern will include letter, number, underscore & apostrophe as word character
'word with apostrophe such as "you're" counts as one word.
'word with underscore such as "aa_bb" counts as one word.


'Const xCol As String = "C:ZZ" 'columns to clear
Dim i As Long, j As Long, k As Long, h As Long
Dim txa As String, txq As String
Dim z, t, va, ary, arz
Dim rngA As Range
   
    Set rngA = Application.Selection
    Set rngA = Application.InputBox("Put the cursor at the proper column", "", rngA.Address, Type:=8)
    rngA.Activate

t = Timer

Application.ScreenUpdating = False

CX = ActiveCell.Column
LC = ActiveSheet.Cells.Find("*", SearchOrder:=xlByColumns, SearchDirection:=xlPrevious).Column
'if there are errors, remove them
On Error Resume Next
Columns(CX).SpecialCells(xlCellTypeFormulas, xlErrors).ClearContents
Columns(CX).SpecialCells(xlConstants, xlErrors).ClearContents
On Error GoTo 0

Cells(1, LC + 2) = Cells(1, CX)
Cells(2, LC + 2) = "Division"
Cells(2, LC + 3) = "WORDS"
Cells(2, LC + 4) = "COUNT"

j = Range("A" & Rows.Count).End(xlUp).Row

va = Range("A1:A" & j)

For i = 2 To UBound(va, 1)  'data start at row 2
 k = i
    Do
        i = i + 1
        If i > UBound(va, 1) Then Exit Do
    Loop While va(i, 1) = va(i - 1, 1)
    i = i - 1
'    Debug.Print j & " : " & i
     txq = txq & va(i, 1) & ":" & k & ":" & i & ","

Next

'Debug.Print txq
        ary = Split(txq, ",")
       
        For h = 0 To UBound(ary) - 1
            arz = Split(ary(h), ":")

                txa = Join(Application.Transpose(Range(Cells(arz(1), CX), Cells(arz(2), CX))), " ")
                'if you have stop words list then put the list starting at A1 in sheet2
                With Sheets("Sheet2")
                    Set rSW = .Range("A1", .Cells(.Rows.Count, "A").End(xlUp))
                End With

                If rSW.Cells(1) <> "" Then Call stopWord(xPattern, txa)
                Debug.Print txa
            Call toProcessY(1, txa, xPattern, CStr(arz(0)))
        Next


'Range(xCol).Columns.AutoFit
Application.ScreenUpdating = True
Cells(1, LC + 2).Activate
Debug.Print "It's done in:  " & Timer - t & " seconds"

End Sub

Sub toProcessY(n As Long, ByVal tx As String, xP As String, div As String)
'phrase frequency

Dim regEx As Object, matches As Object, x As Object, d As Object
Dim i As Long, LR As Long
Dim va, q

        Set regEx = CreateObject("VBScript.RegExp")
        With regEx
            .Global = True
            .MultiLine = True
            .ignorecase = True
        End With


    Set d = CreateObject("scripting.dictionary")
    d.CompareMode = vbTextCompare

'    regEx.Pattern = Trim(WorksheetFunction.Rept("[A-Z0-9_']+ ", n)) 'match n words (the phrase) separated by a space
    regEx.Pattern = Trim(WorksheetFunction.Rept("[" & xP & "]+ ", n)) 'match n words (the phrase) separated by a space
            Set matches = regEx.Execute(tx)
           
            For Each x In matches
                d(CStr(x)) = d(CStr(x)) + 1 'get phrase frequency
            Next
 
For i = 1 To n - 1
       
        regEx.Pattern = "^[" & xP & "]+ "
        If regEx.Test(tx) Then
           tx = regEx.Replace(tx, "")   'remove first word in each line to get different combination of n words (phrase)

'            regEx.Pattern = Trim(WorksheetFunction.Rept("[A-Z0-9_']+ ", n))
            regEx.Pattern = Trim(WorksheetFunction.Rept("[" & xP & "]+ ", n))
           
            Set matches = regEx.Execute(tx)
           
            For Each x In matches
                d(CStr(x)) = d(CStr(x)) + 1     'get phrase frequency
            Next

        End If
Next

'If d.Count = 0 Then MsgBox "Nothing with " & n & " word phrase found": Exit Sub
If d.Count = 0 Then Exit Sub

LR = Cells(Rows.Count, LC + 3).End(xlUp).Row + 1

'put the result
With Cells(LR, LC + 3).Resize(d.Count, 2)
    .Value = Application.Transpose(Array(d.Keys, d.Items))
    .Sort Key1:=.Cells(1, 2), Order1:=xlDescending, Key2:=.Cells(1, 1), Order2:=xlAscending, Header:=xlNo
    .Cells(1, 1).Offset(, -1) = div
End With




End Sub


Sub stopWord(xP As String, tx As String)
Dim n As Long
Dim stW, x
Dim regEx As Object
stW = rSW.Value
        Set regEx = CreateObject("VBScript.RegExp")
        With regEx
            .Global = True
            .MultiLine = True
            .ignorecase = True
        End With
tx = " " & tx
For Each x In stW
        regEx.Pattern = "[^" & xP & "]" & x & "[^" & xP & "]"
        If regEx.Test(tx) Then
           tx = regEx.Replace(tx, " ")   'replace stop word with " "
        End If
Next

End Sub

I've changed the example a bit:
darkbunty - word frequency.xlsm
ABCDEFGH
1DivisionQuestion 1 AnswersQuestion 2 AnswersQuestion 1 Answers
2AWe are oneDivisionWORDSCOUNT
3AThis is doneAone2
4Athis oneWhen you arrveThis2
5BAwesomedone1
6BNo questionWe1
7BNo changesBAwesome1
8CWhen you arrveNo1
9CWhen you arrvequestion1
10DNo changesCarrve1
11DNo changesWhen1
12ENo questionyou1
13ENo questionDchanges1
14No1
15
Sheet1


stop words:
darkbunty - word frequency.xlsm
AB
1and
2is
3are
4
5
Sheet2


W:love::love:W!!!!!! (y)(y):):)

First of all, please accept my apologies for the delayed response as I was working on something else and did not get a chance to come here and check your AMAZING WOKING CODE.

This is working PERFECT!!!!

FLAWLESS. You are an AMAZING person with magical working code.

Thank you once again for all your hard work and resolving this. Really appreciate it!!!!!!
 

Excel Facts

Round to nearest half hour?
Use =MROUND(A2,"0:30") to round to nearest half hour. Use =CEILING(A2,"0:30") to round to next half hour.
This is an amazing macro that I used to run without any issues. However, today I received the following message: Run time error "70": Permission Denied. It stops here
Set regEx = CreateObject("VBScript.RegExp"); I have a reference for MS Script Regex 5.5. Does somebody have an idea what happens? Thanks!
 
Actually, I was wrong about the reference, i.e. the code actually doesn’t need reference to "Microsoft VBScript Regular Expressions 5.5" because it uses late binding method. So it should work with or without the reference to that.
As for Run time error "70": Permission Denied:
1. Can you try this sub and see where it stops?

VBA Code:
Sub just_try()
Dim d As Object, q As Object

    Set d = CreateObject("scripting.dictionary")
            Set q = CreateObject("VBScript.RegExp")

End Sub

2. Any chances you're using Malwarebytes software? There's a report that somehow Malwarebytes disabled RegExp.
 
This is an amazing macro that I used to run without any issues. However, today I received the following message: Run time error "70": Permission Denied. It stops here
Set regEx = CreateObject("VBScript.RegExp"); I have a reference for MS Script Regex 5.5. Does somebody have an idea what happens? Thanks!
I forgot to quote your question in post #24.
 
Hi and THANK YOU for posting this amazing macro. It is so helpful in what I do.
I'm not super familiar with Regex and would love know how easy it would be to add the capability to ignore any word that was only 1 or 2 characters long as well as ignoring a specific list of certain words (maybe on a second tab of the workbook that I could continually update) like "are, was, she, her, his, can, has, had, any, all, out, for, the, and, can, you" etc. ?

Thanks!
-J!m
 
ignoring a specific list of certain words (maybe on a second tab of the workbook that I could continually update) like "are, was, she, her, his, can, has, had, any, all, out, for, the, and, can, you" etc. ?
What you're looking for is called stop words. I've written the code for this, please check post 4-8.

to add the capability to ignore any word that was only 1 or 2 characters long
I'll try to amend the code to do that when I have time. I'm kind of busy right now.
 
What you're looking for is called stop words. I've written the code for this, please check post 4-8.
Thanks, found the updated code... Stop Words works GREAT! Exactly what I needed to upgrade this from Amazing to Spectacular...

For now I can add a few 1 and 2 letter common words to the Stop Word list and that will work great!

Thanks again for such an Amazing Macro.
 
I'll try to amend the code to do that when I have time. I'm kind of busy right now.
I think I figured it out. I took your stopWord code and modified it to remove 1 and 2 letter words as well as a bunch of super common words I don't care about. I call this routine right before your stopWords is called and it's working great. I'll still be using stopWords also so the user can add in some of their own words they don't want.

VBA Code:
Sub commonWord(tx As String)
    
    Const commonWords As String = ".,..,the,and,you,was,for,are,his,one,had,but,not,all,can,use,she,how,out,her,him,has,two,see,way,who,oil,its,now,day,did,get,may,that,with,they,this,have,from,word,what,were,when,your,said,each,will,many,then,them,some,make,like,into,time,look,more,than,been,call,find,long,down,come,made,part,there,which,their,other,about,these,would,write,could,first,water,number,people"
    'Define list of common Stop words to Ignore
    '. and .. will remove all 1 and 2 character words and numbers

    Dim regEx  As Object
    Dim w As String
    Dim c, i
    
    Set regEx = CreateObject("VBScript.RegExp")
    With regEx
        .Global = True
        .MultiLine = True
        .IgnoreCase = True
    End With
    
    c = Split(commonWords, ",")
    tx = " " & tx
    For i = LBound(c) To UBound(c)
        regEx.Pattern = "[^A-Z]" & c(i) & "[^A-Z]"
        If regEx.Test(tx) Then
            tx = regEx.Replace(tx, "|")           'replace stop word with "|"
        End If
    Next
    
End Sub
 
This code has been really useful. However, is there a way to limit it from running into the next cell in the column when counting occurences of phrases? I have a list of free text entries (60k rows), and wish to identify the most frequent 3-word and 4-word groups. At the moment, the return is skewed as the analysis includes the cell below. For example, the following cells would return the phrase "replacement required replacement".

faulty replacement required
replacement item ordered
Loan item sent to customer

I appreciate this thread is old, but it is a really useful piece of code and your assistance would be greatly appreciated.
 

Forum statistics

Threads
1,223,958
Messages
6,175,636
Members
452,662
Latest member
Aman1997

We've detected that you are using an adblocker.

We have a great community of people providing Excel help here, but the hosting costs are enormous. You can help keep this site running by allowing ads on MrExcel.com.
Allow Ads at MrExcel

Which adblocker are you using?

Disable AdBlock

Follow these easy steps to disable AdBlock

1)Click on the icon in the browser’s toolbar.
2)Click on the icon in the browser’s toolbar.
2)Click on the "Pause on this site" option.
Go back

Disable AdBlock Plus

Follow these easy steps to disable AdBlock Plus

1)Click on the icon in the browser’s toolbar.
2)Click on the toggle to disable it for "mrexcel.com".
Go back

Disable uBlock Origin

Follow these easy steps to disable uBlock Origin

1)Click on the icon in the browser’s toolbar.
2)Click on the "Power" button.
3)Click on the "Refresh" button.
Go back

Disable uBlock

Follow these easy steps to disable uBlock

1)Click on the icon in the browser’s toolbar.
2)Click on the "Power" button.
3)Click on the "Refresh" button.
Go back
Back
Top