Hi Cyberdude,
I think there is more to calculating similarity between strings.
However, this would be kind of coding for your idea,
though I doubt whether the result
will reveil what you are looking for.
Public Function SimilarityNumber(a$, b$) As Long
Dim lngI As Long ' just a counter
Dim lngS As Long ' resulting similarity
ReDim na(1 To Len(a$)) As Long ' array of a$
ReDim nb(1 To Len(a$)) As Long ' array of b$
For lngI = 1 To Len(a$)
na(lngI) = Asc(Mid(a$, lngI, 1)) - 96
nb(lngI) = Asc(Mid(b$, lngI, 1)) - 96
lngS = lngS + (na(lngI) - nb(lngI)) ^ 2
Next
SimilarityNumber = lngS
End Function
Sub Test400()
MsgBox SimilarityNumber("apples", "yyyyyy")
End Sub
' -----------------------------------------
for a more professional approach,
I'm quoting from a former posting of mine.
I just isn't that easy.
Start quote:
IMHO, I think, my solution is in principle better
than the "levenshtein distance".
Somewhat to google for.
If you want to know it all, the code below
will give you a correlation coefficient between two strings.
Don't be afraid, some things are complicated,
and there is no easy to understand solution.
Disregard the comments in german.
To comment it all, it would take me a week.
For
str1 = "alberto parreira , alberto juan fernandez parreira"
str2 = "alberto-juan parreira , alberto juuan parreira."
I get
Correlation(Character) = 0.82
Correlation(Substring) = 0.54
Correlation(combined ) = 0.72
Whether this is sufficient for you, I don't know.
Just have a go and good luck,
and beware of line breaks by the newsreader.
Option Explicit
Sub Correlation()
Dim str1 As String
Dim str2 As String
Dim CorChrc As Single ' correlation by character
Dim CorStrn As Single ' correlation by string
str1 = "alberto parreira , alberto juan fernandez parreira"
str2 = "alberto-juan parreira , alberto juuan parreira."
CorChrc = FncCorChr(str1, str2)
CorStrn = FncCorStr(str1, str2)
Debug.Print "Correlation(Character) = " & Format(CorChrc, " 0.00")
Debug.Print "Correlation(Substring) = " & Format(CorStrn, " 0.00")
Debug.Print _
"Correlation(combined ) = " _
& Format((CorChrc * 2 + CorStrn) / 3, " 0.00")
End Sub
Public Function FncCorChr(str1$, str2$) As Single
' Correlation by set of characters
' ============================
' Union = Anzahl(1) + Anzahl(2)
' Relation = kleinerer Wert durch größerer Wert
' Durchschnitt = Relation /2
' gewichteter Durchschnitt = Durchschnitt * Union
'
' Word Correlation WrdCor
' NumCom = Summe aller Vergleiche
' WrdCor = Summe aller ZeichenCorrelationen
' WrdCor = WrdCor / (NumCom/2) 'Durchschnitt der Vergleiche
Dim l As Long
Dim ChrNum As Long
Dim ComNum As Long ' number of comparisons
Dim ChrCor As Single ' character correlation
Dim WrdCor As Single
Dim Union As Long
Dim ArChr01(32 To 255) As Long
Dim ArChr02(32 To 255) As Long
For l = 32 To 255 ' clear arrays
ArChr01(l) = 0
ArChr02(l) = 0
Next
For l = 1 To Len(str1) ' count frequency
ChrNum = Asc(Mid$(str1, l, 1))
ArChr01(ChrNum) = ArChr01(ChrNum) + 1
Next
For l = 1 To Len(str2)
ChrNum = Asc(Mid$(str2, l, 1))
ArChr02(ChrNum) = ArChr02(ChrNum) + 1
Next
ComNum = 0
WrdCor = 0
For l = 32 To 255
Union = ArChr01(l) + ArChr02(l)
If Union = 0 Then GoTo fertig ' Don't process
If ArChr01(l) = 0 Or ArChr02(l) = 0 Then ' zero anyway
ChrCor = 0
GoTo weiter
End If
If ArChr01(l) = ArChr02(l) Then ' short cut
ChrCor = Union / 2
GoTo weiter
End If
If ArChr01(l) <> ArChr02(l) Then
If ArChr01(l) > ArChr02(l) Then
ChrCor = ArChr02(l) / ArChr01(l)
ChrCor = ChrCor / 2
ChrCor = ChrCor * Union
End If
If ArChr01(l) < ArChr02(l) Then
ChrCor = ArChr01(l) / ArChr02(l)
ChrCor = ChrCor / 2
ChrCor = ChrCor * Union
End If
End If
weiter:
WrdCor = WrdCor + ChrCor
fertig:
Next l
ComNum = 0
For l = 32 To 255
ComNum = ComNum + ArChr01(l) + ArChr02(l)
Next
FncCorChr = WrdCor / (ComNum / 2)
End Function
Public Function FncCorStr(LongStr$, ShrtStr$) As Single
' get substrings longer than minimum length
' get number of all strings
' get number of common strings
' calculate relation of common strings to all strings
Dim ShrtLen As Long
Dim LongLen As Long
Dim f As Boolean ' found
Dim IsInComm As Boolean ' substring is in common
Dim l As Long
Dim m As Long
Dim n As Long
Dim p As Long ' position
Dim s0 As String
Dim S1 As String
Dim ShrtLoc As String
Dim LongLoc As String
Dim TempLoc As String
Dim shrtMin As Long
Dim ShrtSum As Long ' 1 + 2 + n for long
Dim LongSum As Long ' 1 + 2 + n for long
Dim HalfArr As Long ' half way of array
Dim HalfStp As Long
Dim ComLSum As Long ' sum of length of common
Dim ShrtStrItm() As String ' substrings short
Dim LongStrItm() As String ' substrings long
Dim CommStrItm() As String ' common strings
Dim ShrtStrFrq() As Long ' frequency short
Dim LongStrFrq() As Long ' frequency long
ShrtLoc = ShrtStr ' local value
LongLoc = LongStr ' local value
ShrtSum = 0
LongSum = 0
LongLoc = LCase(LongLoc)
ShrtLoc = LCase(ShrtLoc)
If Len(ShrtLoc) > Len(LongLoc) Then
TempLoc = LongLoc
LongLoc = ShrtLoc
ShrtLoc = TempLoc
End If
LongLen = Len(LongLoc)
ShrtLen = Len(ShrtLoc)
shrtMin = 2 ' CLng(TxSubMin.Text)
' kürzester zu untersuchender Substring
' ------------------------------------- number of substrings
' --------------------------------------------- Summenformel
For l = 1 To ShrtLen - (shrtMin - 1)
ShrtSum = ShrtSum + l
Next
' ---------------------------- redim array for short strings
ReDim ShrtStrItm(ShrtSum)
ReDim ShrtStrFrq(ShrtSum)
For l = 1 To LongLen - (shrtMin - 1)
LongSum = LongSum + l
Next
' ----------------------------- redim array for long strings
ReDim LongStrItm(LongSum)
ReDim LongStrFrq(LongSum)
'___________________________________________________________
' ---------------------- add subs of shorter string to array
n = 0
For l = 1 To ShrtLen - (shrtMin - 1) ' 1 2
p = 0 ' 5
For m = 1 To l
n = n + 1
p = p + 1
ShrtStrItm(n) = Mid(ShrtLoc, p, ShrtLen - l + 1)
Next
Next
' ----------------------- add subs of longer string to array
n = 0
For l = 1 To LongLen - (shrtMin - 1) ' 1 2
p = 0 ' 5
For m = 1 To l
n = n + 1
p = p + 1
LongStrItm(n) = Mid(LongLoc, p, LongLen - l + 1)
Next
Next
' ----------------------------------- Count freqencies short
' -------------------------- get index of first short string
' ------------------------- equal half length of long string
HalfStp = 0
HalfArr = CLng((ShrtLen) / 2)
For l = 1 To HalfArr
HalfStp = HalfStp + l
Next
For l = 1 To HalfStp
ShrtStrFrq(l) = 1
Next
For l = HalfStp + 1 To ShrtSum
ShrtStrFrq(l) = FncStrCnt(ShrtLoc, ShrtStrItm(l))
Next
' --------------------------------- remove double from array
For l = 1 To ShrtSum
For m = l + 1 To ShrtSum
If ShrtStrItm(l) = ShrtStrItm(m) Then
For n = m To ShrtSum - 1
ShrtStrItm(n) = ShrtStrItm(n + 1) ' verschieben
ShrtStrFrq(n) = ShrtStrFrq(n + 1)
Next n
ShrtSum = ShrtSum - 1
ReDim Preserve ShrtStrItm(ShrtSum)
ReDim Preserve ShrtStrFrq(ShrtSum)
Exit For
End If
Next m
Next l
' ---------------- end of collecting data for shorter string
'___________________________________________________________
' ----------------------------------- first common substring
' --------------------------- beware of no common substrings
f = False
For l = 1 To ShrtSum
If InStr(LongLoc, ShrtStrItm(l)) > 0 Then
f = True
ReDim CommStrItm(1)
CommStrItm(1) = ShrtStrItm(l)
Exit For
End If
Next
If f = False Then
FncCorStr = 0
Exit Function
End If
n = 1
'--------------------------------- further common substrings
S1 = CommStrItm(1)
For m = l + 1 To ShrtSum ' ab gefunden weitersuchen
s0 = ShrtStrItm(m)
If (InStr(LongLoc, s0) > 0) Then
IsInComm = False
For p = 1 To n
If InStr(CommStrItm(p), s0) > 0 Then
IsInComm = True
Exit For
End If
Next
If Not IsInComm Then
n = n + 1
ReDim Preserve CommStrItm(n)
CommStrItm(n) = s0
End If
End If
Next
ComLSum = 0
For l = 1 To n
ComLSum = ComLSum + Len(CommStrItm(l))
Next
If ComLSum > LongLen Then
FncCorStr = LongLen / ComLSum
Else
FncCorStr = ComLSum / LongLen
End If
End Function
Public Function FncStrCnt(Lng$, Shr$) As Long
' ----------------------------------- count string in string
Dim l As Long ' position
Dim m As Long ' counter
l = 1
m = 0
While InStr(l, Lng, Shr) > 0
m = m + 1
l = InStr(l, Lng, Shr) + 1
Wend
FncStrCnt = m
End Function
--
Greetings from Bavaria, Germany
Helmut Weber, MVP WordVBA
Vista Small Business, Office XP