.NET convert hyperlink to url string

Trying to convert HTML code to a plain-text version? The code below removes all HTML tags:

' Strip every HTML tag (anything between "<" and ">") from mystring.
' Regex.Replace(input, pattern, replacement) is a Shared method, so it is called on the
' type itself; no Regex instance has to be declared or created.
mystring = System.Text.RegularExpressions.Regex.Replace(mystring, "<[^>]*>", "")

But what if I also need to extract and keep all the URLs from the <a> tags?
I wrote a function that converts every <a>…</a> element to a plain URL string. It is in VB because the original project was developed in VB. Yes, I miss C# syntax 😦

''' <summary>
''' Replaces every anchor element (&lt;a ...&gt;...&lt;/a&gt;) found at or after
''' <paramref name="startIndex"/> with the value of its href attribute, wrapped in a
''' single space on each side. For example
''' "&lt;a href="http://www.google.com"&gt;Google&lt;/a&gt;" becomes " http://www.google.com ".
''' </summary>
''' <param name="source">The HTML text to process. Nothing is returned unchanged.</param>
''' <param name="startIndex">
''' Zero-based position at which the search for anchors starts; text before it is copied
''' to the result untouched.
''' </param>
''' <returns>
''' The text with each anchor replaced by its URL. Anchors without a quoted href attribute
''' are left as they are, and an opening tag with no closing tag ends the processing.
''' </returns>
''' <exception cref="System.ArgumentOutOfRangeException">
''' <paramref name="startIndex"/> is negative or greater than the length of
''' <paramref name="source"/>.
''' </exception>
Public Function ConvertHyperlinksToText(ByVal source As String, ByVal startIndex As Integer) As String

        ' Wrapper placed around each extracted URL; use "(" and ")" here to get
        ' "(http://www.google.com)" instead of " http://www.google.com ".
        Const urlLeftWrapperString As String = " "
        Const urlRightWrapperString As String = " "

        ' Anchor start and end markers (matched case-insensitively).
        Const tagStart As String = "<a "
        Const tagEnd As String = "</a>"

        If source Is Nothing Then
            Return Nothing
        End If

        If startIndex < 0 OrElse startIndex > source.Length Then
            Throw New System.ArgumentOutOfRangeException("startIndex")
        End If

        ' The result is built in one left-to-right pass instead of one recursive call per
        ' anchor, so the work stays linear and the stack stays flat on link-heavy input.
        Dim output As New System.Text.StringBuilder(source.Length)
        output.Append(source, 0, startIndex)

        Dim position As Integer = startIndex

        Do
            Dim pos_tagStart As Integer = source.IndexOf(tagStart, position, System.StringComparison.OrdinalIgnoreCase)
            If pos_tagStart < 0 Then
                Exit Do
            End If

            ' Look for the closing tag only AFTER the opening tag. Searching from the scan
            ' position instead would let an earlier stray "</a>" hide every later anchor.
            Dim pos_tagEnd As Integer = source.IndexOf(tagEnd, pos_tagStart + tagStart.Length, System.StringComparison.OrdinalIgnoreCase)
            If pos_tagEnd < 0 Then
                ' Unterminated anchor: nothing more can be converted safely.
                Exit Do
            End If

            Dim blockEnd As Integer = pos_tagEnd + tagEnd.Length
            Dim stringTagBlock As String = source.Substring(pos_tagStart, blockEnd - pos_tagStart)

            ' Prefer a double-quoted href="..."; fall back to a single-quoted href='...'.
            Dim url As String = ExtractQuotedHref(stringTagBlock, """")
            If url Is Nothing Then
                url = ExtractQuotedHref(stringTagBlock, "'")
            End If

            ' Copy the text between the previous anchor and this one.
            output.Append(source, position, pos_tagStart - position)

            If url Is Nothing Then
                ' No usable href (probably a malformed or named anchor): keep the block as is.
                output.Append(stringTagBlock)
            Else
                output.Append(urlLeftWrapperString).Append(url).Append(urlRightWrapperString)
            End If

            ' Continue right after the closing tag so the scan always advances.
            position = blockEnd
        Loop

        ' Copy whatever follows the last processed anchor.
        output.Append(source, position, source.Length - position)

        Return output.ToString()

    End Function

''' <summary>
''' Returns the value of the first href attribute in <paramref name="tagBlock"/> that is
''' delimited by <paramref name="quote"/>, or Nothing when no such attribute (or no
''' closing quote) is present.
''' </summary>
Private Function ExtractQuotedHref(ByVal tagBlock As String, ByVal quote As String) As String

        ' The leading space keeps attributes such as "data-href" from matching.
        Dim marker As String = " href=" & quote

        Dim markerPos As Integer = tagBlock.IndexOf(marker, System.StringComparison.OrdinalIgnoreCase)
        If markerPos < 0 Then
            Return Nothing
        End If

        Dim valueStart As Integer = markerPos + marker.Length
        Dim valueEnd As Integer = tagBlock.IndexOf(quote, valueStart, System.StringComparison.Ordinal)
        If valueEnd < 0 Then
            Return Nothing
        End If

        Return tagBlock.Substring(valueStart, valueEnd - valueStart)

    End Function


Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s