I'm late to the party, here is an Excel VBA script I wrote years ago for Australia. It can be easily modified to support other Countries. I've made a GitHub repository of the C# code here. I've hosted it on my site and you can download it here: http://jeremythompson.net/rocks/ParseAddress.xlsm
For any country with a PostCode that's numeric or can be matched with a RegEx my strategy works very well:
First we detect the First and Surname which are assumed to be the top line. Its easy to skip the name and start with the address by unticking the checkbox (called 'Name is top row' as shown below).
Next its safe to expect the Address consisting of the Street and Number come before the Suburb and the St, Pde, Ave, Av, Rd, Cres, loop, etc is a separator.
Detecting the Suburb vs the State and even Country can trick the most sophisticated parsers as there can be conflicts. To overcome this I use a PostCode look up based on the fact that after stripping Street and Apartment/Unit numbers as well as the PoBox,Ph,Fax,Mobile etc, only the PostCode number will remain. This is easy to match with a regEx to then look up the suburb(s) and country.
Your National Post Office Service will provide a list of post codes with Suburbs and States free of charge that you can store in an excel sheet, db table, text/json/xml file, etc.
DISCLAIMER, I know this code is not perfect, or even written well however its very easy to convert to any programming language and run in any type of application.The strategy is the answer depending on your country and rules, take this code as an example:
Option Explicit
Private Const TopRow As Integer = 0
Public Sub ParseAddress()
Dim strArr() As String
Dim sigRow() As String
Dim i As Integer
Dim j As Integer
Dim k As Integer
Dim Stat As String
Dim SpaceInName As Integer
Dim Temp As String
Dim PhExt As String
On Error Resume Next
Temp = ActiveSheet.Range("Address")
'Split info into array
strArr = Split(Temp, vbLf)
'Trim the array
For i = 0 To UBound(strArr)
strArr(i) = VBA.Trim(strArr(i))
Next i
'Remove empty items/rows
ReDim sigRow(LBound(strArr) To UBound(strArr))
For i = LBound(strArr) To UBound(strArr)
If Trim(strArr(i)) <> "" Then
sigRow(j) = strArr(i)
j = j + 1
End If
Next i
ReDim Preserve sigRow(LBound(strArr) To j)
'Find the name (MUST BE ON THE FIRST ROW UNLESS CHECKBOX UNTICKED)
i = TopRow
If ActiveSheet.Shapes("chkFirst").ControlFormat.Value = 1 Then
SpaceInName = InStr(1, sigRow(i), " ", vbTextCompare) - 1
If ActiveSheet.Shapes("chkConfirm").ControlFormat.Value = 0 Then
ActiveSheet.Range("FirstName") = VBA.Left(sigRow(i), SpaceInName)
Else
If MsgBox("First Name: " & VBA.Mid$(sigRow(i), 1, SpaceInName), vbQuestion + vbYesNo, "Confirm Details") = vbYes Then ActiveSheet.Range("FirstName") = VBA.Left(sigRow(i), SpaceInName)
End If
If ActiveSheet.Shapes("chkConfirm").ControlFormat.Value = 0 Then
ActiveSheet.Range("Surname") = VBA.Mid(sigRow(i), SpaceInName + 2)
Else
If MsgBox("Surame: " & VBA.Mid(sigRow(i), SpaceInName + 2), vbQuestion + vbYesNo, "Confirm Details") = vbYes Then ActiveSheet.Range("Surname") = VBA.Mid(sigRow(i), SpaceInName + 2)
End If
sigRow(i) = ""
End If
'Find the Street by looking for a "St, Pde, Ave, Av, Rd, Cres, loop, etc"
For i = 1 To UBound(sigRow)
If Len(sigRow(i)) > 0 Then
For j = 0 To 8
If InStr(1, VBA.UCase(sigRow(i)), Street(j), vbTextCompare) > 0 Then
'Find the position of the street in order to get the suburb
SpaceInName = InStr(1, VBA.UCase(sigRow(i)), Street(j), vbTextCompare) + Len(Street(j)) - 1
'If its a po box then add 5 chars
If VBA.Right(Street(j), 3) = "BOX" Then SpaceInName = SpaceInName + 5
If ActiveSheet.Shapes("chkConfirm").ControlFormat.Value = 0 Then
ActiveSheet.Range("Street") = VBA.Mid(sigRow(i), 1, SpaceInName)
Else
If MsgBox("Street Address: " & VBA.Mid(sigRow(i), 1, SpaceInName), vbQuestion + vbYesNo, "Confirm Details") = vbYes Then ActiveSheet.Range("Street") = VBA.Mid(sigRow(i), 1, SpaceInName)
End If
'Trim the Street, Number leaving the Suburb if its exists on the same line
sigRow(i) = VBA.Mid(sigRow(i), SpaceInName) + 2
sigRow(i) = Replace(sigRow(i), VBA.Mid(sigRow(i), 1, SpaceInName), "")
GoTo PastAddress:
End If
Next j
End If
Next i
PastAddress:
'Mobile
For i = 1 To UBound(sigRow)
If Len(sigRow(i)) > 0 Then
For j = 0 To 3
Temp = Mb(j)
If VBA.Left(VBA.UCase(sigRow(i)), Len(Temp)) = Temp Then
If ActiveSheet.Shapes("chkConfirm").ControlFormat.Value = 0 Then
ActiveSheet.Range("Mobile") = VBA.Mid(sigRow(i), Len(Temp) + 2)
Else
If MsgBox("Mobile: " & VBA.Mid(sigRow(i), Len(Temp) + 2), vbQuestion + vbYesNo, "Confirm Details") = vbYes Then ActiveSheet.Range("Mobile") = VBA.Mid(sigRow(i), Len(Temp) + 2)
End If
sigRow(i) = ""
GoTo PastMobile:
End If
Next j
End If
Next i
PastMobile:
'Phone
For i = 1 To UBound(sigRow)
If Len(sigRow(i)) > 0 Then
For j = 0 To 1
Temp = Ph(j)
If VBA.Left(VBA.UCase(sigRow(i)), Len(Temp)) = Temp Then
'TODO: Detect the intl or national extension here.. or if we can from the postcode.
If ActiveSheet.Shapes("chkConfirm").ControlFormat.Value = 0 Then
ActiveSheet.Range("Phone") = VBA.Mid(sigRow(i), Len(Temp) + 3)
Else
If MsgBox("Phone: " & VBA.Mid(sigRow(i), Len(Temp) + 3), vbQuestion + vbYesNo, "Confirm Details") = vbYes Then ActiveSheet.Range("Phone") = VBA.Mid(sigRow(i), Len(Temp) + 3)
End If
sigRow(i) = ""
GoTo PastPhone:
End If
Next j
End If
Next i
PastPhone:
'Email
For i = 1 To UBound(sigRow)
If Len(sigRow(i)) > 0 Then
'replace with regEx search
If InStr(1, sigRow(i), "@", vbTextCompare) And InStr(1, VBA.UCase(sigRow(i)), ".CO", vbTextCompare) Then
Dim email As String
email = sigRow(i)
email = Replace(VBA.UCase(email), "EMAIL:", "")
email = Replace(VBA.UCase(email), "E-MAIL:", "")
email = Replace(VBA.UCase(email), "E:", "")
email = Replace(VBA.UCase(Trim(email)), "E ", "")
email = VBA.LCase(email)
If ActiveSheet.Shapes("chkConfirm").ControlFormat.Value = 0 Then
ActiveSheet.Range("Email") = email
Else
If MsgBox("Email: " & email, vbQuestion + vbYesNo, "Confirm Details") = vbYes Then ActiveSheet.Range("Email") = email
End If
sigRow(i) = ""
Exit For
End If
End If
Next i
'Now the only remaining items will be the postcode, suburb, country
'there shouldn't be any numbers (eg. from PoBox,Ph,Fax,Mobile) except for the Post Code
'Join the string and filter out the Post Code
Temp = Join(sigRow, vbCrLf)
Temp = Trim(Temp)
For i = 1 To Len(Temp)
Dim postCode As String
postCode = VBA.Mid(Temp, i, 4)
'In Australia PostCodes are 4 digits
If VBA.Mid(Temp, i, 1) <> " " And IsNumeric(postCode) Then
If ActiveSheet.Shapes("chkConfirm").ControlFormat.Value = 0 Then
ActiveSheet.Range("PostCode") = postCode
Else
If MsgBox("Post Code: " & postCode, vbQuestion + vbYesNo, "Confirm Details") = vbYes Then ActiveSheet.Range("PostCode") = postCode
End If
'Lookup the Suburb and State based on the PostCode, the PostCode sheet has the lookup
Dim mySuburbArray As Range
Set mySuburbArray = Sheets("PostCodes").Range("A2:B16670")
Dim suburbs As String
For j = 1 To mySuburbArray.Columns(1).Cells.Count
If mySuburbArray.Cells(j, 1) = postCode Then
'Check if the suburb is listed in the address
If InStr(1, UCase(Temp), mySuburbArray.Cells(j, 2), vbTextCompare) > 0 Then
'Set the Suburb and State
ActiveSheet.Range("Suburb") = mySuburbArray.Cells(j, 2)
Stat = mySuburbArray.Cells(j, 3)
ActiveSheet.Range("State") = Stat
'Knowing the State - for Australia we can get the telephone Ext
PhExt = PhExtension(VBA.UCase(Stat))
ActiveSheet.Range("PhExt") = PhExt
'remove the phone extension from the number
Dim prePhone As String
prePhone = ActiveSheet.Range("Phone")
prePhone = Replace(prePhone, PhExt & " ", "")
prePhone = Replace(prePhone, "(" & PhExt & ") ", "")
prePhone = Replace(prePhone, "(" & PhExt & ")", "")
ActiveSheet.Range("Phone") = prePhone
Exit For
End If
End If
Next j
Exit For
End If
Next i
End Sub
Private Function PhExtension(ByVal State As String) As String
Select Case State
Case Is = "NSW"
PhExtension = "02"
Case Is = "QLD"
PhExtension = "07"
Case Is = "VIC"
PhExtension = "03"
Case Is = "NT"
PhExtension = "04"
Case Is = "WA"
PhExtension = "05"
Case Is = "SA"
PhExtension = "07"
Case Is = "TAS"
PhExtension = "06"
End Select
End Function
Private Function Ph(ByVal Num As Integer) As String
Select Case Num
Case Is = 0
Ph = "PH"
Case Is = 1
Ph = "PHONE"
'Case Is = 2
'Ph = "P"
End Select
End Function
Private Function Mb(ByVal Num As Integer) As String
Select Case Num
Case Is = 0
Mb = "MB"
Case Is = 1
Mb = "MOB"
Case Is = 2
Mb = "CELL"
Case Is = 3
Mb = "MOBILE"
'Case Is = 4
'Mb = "M"
End Select
End Function
Private Function Fax(ByVal Num As Integer) As String
Select Case Num
Case Is = 0
Fax = "FAX"
Case Is = 1
Fax = "FACSIMILE"
'Case Is = 2
'Fax = "F"
End Select
End Function
Private Function State(ByVal Num As Integer) As String
Select Case Num
Case Is = 0
State = "NSW"
Case Is = 1
State = "QLD"
Case Is = 2
State = "VIC"
Case Is = 3
State = "NT"
Case Is = 4
State = "WA"
Case Is = 5
State = "SA"
Case Is = 6
State = "TAS"
End Select
End Function
Private Function Street(ByVal Num As Integer) As String
Select Case Num
Case Is = 0
Street = " ST"
Case Is = 1
Street = " RD"
Case Is = 2
Street = " AVE"
Case Is = 3
Street = " AV"
Case Is = 4
Street = " CRES"
Case Is = 5
Street = " LOOP"
Case Is = 6
Street = "PO BOX"
Case Is = 7
Street = " STREET"
Case Is = 8
Street = " ROAD"
Case Is = 9
Street = " AVENUE"
Case Is = 10
Street = " CRESENT"
Case Is = 11
Street = " PARADE"
Case Is = 12
Street = " PDE"
Case Is = 13
Street = " LANE"
Case Is = 14
Street = " COURT"
Case Is = 15
Street = " BLVD"
Case Is = 16
Street = "P.O. BOX"
Case Is = 17
Street = "P.O BOX"
Case Is = 18
Street = "PO BOX"
Case Is = 19
Street = "POBOX"
End Select
End Function