'---------------------------------------------------------------------------------------
' Module : CountUniqueWordsInRange
' Author : JoeMo
' Date : 3/28/2013
' Purpose : Run from the activesheet. Assumes all data are in column A of the activesheet.
' Requires that words are separated by the space character.
' Returns all unique words with a count of the number of occurrences of
' each word in column A to a new sheet named "Unique Words".
' Limit on total word count that can be handled
' is 17,179,869,184 (Excel 2007 or later versions). Limit on number of
' unique words that can be handled is 1,048,575 (Excel 2007 or later versions).
' Treats numbers and acronyms as words.
'---------------------------------------------------------------------------------------
Sub CountUniqueWordsInRange()
Dim rS As Range, sSht As Worksheet, dSht As Worksheet, aSht As Worksheet
Dim Punc As Variant, lRs As Long, lRd As Long, c As Range
Dim totWords As Long, colCt As Long, vA As Variant, vO() As Variant
Dim i As Long, j As Long, k As Long, Ct As Long, n As Long
'define source range
Set sSht = ActiveSheet
lRs = sSht.Range("A" & Rows.Count).End(xlUp).Row
Set rS = sSht.Range("A1", "A" & lRs)
'Get total word count
totWords = CountWords(rS)
'determine how many columns needed to list all words
colCt = WorksheetFunction.RoundUp(totWords / Rows.Count, 0)
If colCt > Columns.Count Then
MsgBox "Too many words to list in one sheet - truncate the input range and try again." & vbNewLine & "Goodbye."
Exit Sub
End If
With Application
.ScreenUpdating = False
.Calculation = xlCalculationManual
.DisplayAlerts = False
.StatusBar = "PROCESSING YOUR DATA - PLEASE BE PATIENT"
End With
'Add sheet to list all words in source range
On Error Resume Next
Worksheets("All Words").Delete
On Error GoTo 0
ActiveWorkbook.Sheets.Add after:=sSht
Set aSht = ActiveSheet
aSht.Name = "All Words"
'list all words
Punc = Array(".", ",", ";", ":", "?", "!", "~", "@", "#", "$", _
"(", ")", "/", Chr(34), Chr(147), Chr(148))
For Each c In rS
If Not IsEmpty(c) Then
vA = Split(Trim(c.Value), " ")
Ct = Ct + UBound(vA) + 1
ReDim Preserve vO(1 To Ct)
For i = LBound(vA) To UBound(vA)
For j = LBound(Punc) To UBound(Punc)
vA(i) = Replace(vA(i), Punc(j), "")
Next j
Next i
For j = LBound(vA) To UBound(vA)
k = k + 1
vO(k) = vA(j)
Next j
End If
Next c
'put all words into All Words sheet
k = 0
n = 0
For i = 1 To colCt
Do Until n = aSht.Rows.Count Or k = UBound(vO)
k = k + 1
n = n + 1
aSht.Cells(n, i).Value = vO(k)
Loop
n = 0
Next i
'copy all words to sheet "Unique Words" and remove duplicates
On Error Resume Next
Worksheets("Unique Words").Delete
On Error GoTo 0
aSht.Copy after:=sSht
Set dSht = ActiveSheet
dSht.Name = "Unique Words"
For i = 1 To colCt
dSht.Range("A1").CurrentRegion.Columns(i).RemoveDuplicates Columns:=1, Header:=xlNo
Next i
'Get word count remaining after dups removal from individual columns
totWords = CountWords(dSht.Range("A1").CurrentRegion)
If totWords > dSht.Rows.Count Then
MsgBox "Too many words remaining for a single column after first pass - Goodbye."
Exit Sub
End If
'Consolidate columns and remove dups again
With dSht
lRd = .Range("A" & Rows.Count).End(xlUp).Row + 1
For i = 2 To colCt
.Range(Cells(1, i), Cells(Rows.Count, i).End(xlUp)).Cut Destination:=Cells(lRd, 1)
Next i
End With
'Final dups removal from the one remaining column
dSht.Range("A1").CurrentRegion.RemoveDuplicates Columns:=1, Header:=xlNo
dSht.Range("A1").EntireRow.Insert
With dSht.Range("A1:B1")
.Value = Array("Word", "Count")
.Font.Bold = True
End With
'Get count of each unique word
lRd = dSht.Range("A" & Rows.Count).End(xlUp).Row
dSht.Range("B2").FormulaR1C1 = "=COUNTIF('All Words'!C[-1]:C[" & colCt - 2 & "],'Unique Words'!RC[-1])"
With dSht.Range("B2", "B" & lRd)
.FillDown
.Calculate
.Copy
.PasteSpecial xlPasteValues
Application.CutCopyMode = False
End With
With dSht.Range("A1:B1")
.EntireColumn.AutoFit
End With
With Application
.ScreenUpdating = True
.Calculation = xlCalculationAutomatic
.DisplayAlerts = True
.StatusBar = False
End With
End Sub
Function CountWords(R As Range) As Long
Dim lChars As Long, c As Range, Ct As Long
For Each c In R
Ct = 0
lChars = Len(Trim(c.Value))
If lChars = 0 Then
Ct = 0
Else
Ct = Len(Trim(c.Value)) - Len(Replace(Trim(c.Value), " ", "")) + 1
End If
CountWords = CountWords + Ct
Next c
End Function