PC_Meister
Board Regular
- Joined
- Aug 28, 2013
- Messages
- 72
Hello,
I am sorry in advance if I have picked the wrong words for the title of thread (which i think is the case, since web search returned no relevant results). Basically I have a column with company names where the company names were not standardized. Meaning you will see for example, "Microsoft Corp.", "Microsoft Corpo", "Microsoft Corp" or "Hewlett Packard Corp", "Hewlett-Packard Corp." etc. This is annoying in particular when trying to do some metrics and Pivot Graphs because you will see various data points for the same company.
So what I thought of doing is to use Approximate string matching to unify the list by having one name for each company. I wrote a little code shown below, I am close but not quite there yet. I am wondering if anybody has better suggestions.
Please note that in my case I don't really care which name to be used as long as there is only one per company
Sample output
[TABLE="class: grid, width: 286"]
<colgroup><col><col></colgroup><tbody></tbody>[/TABLE]
[TABLE="class: grid, width: 286, align: left"]
<colgroup><col><col></colgroup><tbody>[TR]
[TD]PreProcessing
[/TD]
[TD]AfterProcessing[/TD]
[/TR]
[TR]
[TD]Microsoft Corp.[/TD]
[TD]Microsoft Cor,[/TD]
[/TR]
[TR]
[TD]Microsoft Corpo[/TD]
[TD]Microsoft Cor,[/TD]
[/TR]
[TR]
[TD]Microsoft Corp[/TD]
[TD]Microsoft Cor,[/TD]
[/TR]
[TR]
[TD]Microsoft Cor,[/TD]
[TD]Microsoft Corp[/TD]
[/TR]
[TR]
[TD]Hewlett Packard Corp
[/TD]
[TD]Hewlett Packard Crp
[/TD]
[/TR]
[TR]
[TD]Hewlett Packard Corp.
[/TD]
[TD]Hewlett Packard Crp
[/TD]
[/TR]
[TR]
[TD]Hewlett Packard Corpo
[/TD]
[TD]Hewlett Packard Crp
[/TD]
[/TR]
[TR]
[TD]Hewlett Packard Corp,
[/TD]
[TD]Hewlett Packard Crp
[/TD]
[/TR]
[TR]
[TD]Hewlett Packard Crp
[/TD]
[TD]Hewlett Packard Corp,
[/TD]
[/TR]
</tbody>[/TABLE]
I am sorry in advance if I have picked the wrong words for the title of thread (which i think is the case, since web search returned no relevant results). Basically I have a column with company names where the company names were not standardized. Meaning you will see for example, "Microsoft Corp.", "Microsoft Corpo", "Microsoft Corp" or "Hewlett Packard Corp", "Hewlett-Packard Corp." etc. This is annoying in particular when trying to do some metrics and Pivot Graphs because you will see various data points for the same company.
So what I thought of doing is to use Approximate string matching to unify the list by having one name for each company. I wrote a little code shown below, I am close but not quite there yet. I am wondering if anybody has better suggestions.
Please note that in my case I don't really care which name to be used as long as there is only one per company
Code:
Sub FuzzyMatchMassaging()
Dim v() As Variant: v() = Range("A2:B10").Value2 ' output will be in the B column
Dim i As Long, j As Long
Dim tempPer As Single: tempPer = 0
Dim tempName As String
For i = LBound(v, 1) To UBound(v, 1)
tempName = v(i, 1)
For j = LBound(v, 1) To UBound(v, 1)
If v(i, 1) <> v(j, 1) Then
Dim placeHolder As Single: placeHolder = Strings.JW(Mid(v(i, 1), 1, 10), Mid(v(j, 1), 1, 10)) ' do the approximate string matching on the first 10 characters of the string
If placeHolder >= tempPer Then
tempPer = placeHolder
tempName = v(j, 1)
End If
End If
Next j
If Strings.JW(Mid(v(i, 1), 1, 10), Mid(tempName, 1, 10)) >= 0.75 Then
v(i, 2) = tempName
Else
v(i, 2) = v(i, 1)
End If
Next i
Range("A2:B10").Value2 = v
End Sub
Function JW(ByVal str1 As String, ByVal str2 As String) As Double 'Jaro-Winkler distance
Dim l1, l2, lmin, lmax, m, i, j As Integer
Dim common As Integer
Dim tr As Double
Dim a1, a2 As String
l1 = Len(str1)
l2 = Len(str2)
If l1 > l2 Then
aux = l2
l2 = l1
l1 = aux
auxstr = str1
str1 = str2
str2 = auxstr
End If
lmin = l1
lmax = l2
Dim f1(), f2() As Boolean
ReDim f1(l1), f2(l2)
For i = 1 To l1
f1(i) = False
Next i
For j = 1 To l2
f2(j) = False
Next j
m = Int((lmax / 2) - 1)
common = 0
tr = 0
For i = 1 To l1
a1 = Mid(str1, i, 1)
If m >= i Then
f = 1
l = i + m
Else
f = i - m
l = i + m
End If
If l > lmax Then
l = lmax
End If
For j = f To l
a2 = Mid(str2, j, 1)
If (a2 = a1) And (f2(j) = False) Then
common = common + 1
f1(i) = True
f2(j) = True
GoTo linea_exit
End If
Next j
linea_exit:
Next i
Dim wcd, wrd, wtr As Double
l = 1
For i = 1 To l1
If f1(i) Then
For j = l To l2
If f2(j) Then
l = j + 1
a1 = Mid(str1, i, 1)
a2 = Mid(str2, j, 1)
If a1 <> a2 Then
tr = tr + 0.5
End If
Exit For
End If
Next j
End If
Next i
wcd = 1 / 3
wrd = 1 / 3
wtr = 1 / 3
If common <> 0 Then
JW = wcd * common / l1 + wrd * common / l2 + wtr * (common - tr) / common
Else
JW = 0
End If
End Function
Sample output
[TABLE="class: grid, width: 286"]
<colgroup><col><col></colgroup><tbody></tbody>[/TABLE]
[TABLE="class: grid, width: 286, align: left"]
<colgroup><col><col></colgroup><tbody>[TR]
[TD]PreProcessing
[/TD]
[TD]AfterProcessing[/TD]
[/TR]
[TR]
[TD]Microsoft Corp.[/TD]
[TD]Microsoft Cor,[/TD]
[/TR]
[TR]
[TD]Microsoft Corpo[/TD]
[TD]Microsoft Cor,[/TD]
[/TR]
[TR]
[TD]Microsoft Corp[/TD]
[TD]Microsoft Cor,[/TD]
[/TR]
[TR]
[TD]Microsoft Cor,[/TD]
[TD]Microsoft Corp[/TD]
[/TR]
[TR]
[TD]Hewlett Packard Corp
[/TD]
[TD]Hewlett Packard Crp
[/TD]
[/TR]
[TR]
[TD]Hewlett Packard Corp.
[/TD]
[TD]Hewlett Packard Crp
[/TD]
[/TR]
[TR]
[TD]Hewlett Packard Corpo
[/TD]
[TD]Hewlett Packard Crp
[/TD]
[/TR]
[TR]
[TD]Hewlett Packard Corp,
[/TD]
[TD]Hewlett Packard Crp
[/TD]
[/TR]
[TR]
[TD]Hewlett Packard Crp
[/TD]
[TD]Hewlett Packard Corp,
[/TD]
[/TR]
</tbody>[/TABLE]