'File and Web Scraper - JFO Open-source Code

'Web Scraper - Console Application
'Copyright (c) 2013 State of Vermont Legislative Joint Fiscal Office.
'Licensed under the Apache License, Version 2.0 (the "License");
'you may not use this file except in compliance with the License.
'You may obtain a copy of the License at
 '      			
'http://www.apache.org/licenses/LICENSE-2.0
'
'Unless required by applicable law or agreed to in writing, software
'distributed under the License is distributed on an "AS IS" BASIS,
'WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
'See the License for the specific language governing permissions and
'limitations under the License.	

Imports System.IO
Imports System.Text.RegularExpressions
Imports System.Collections

Namespace JFO
    Module WebScraper

        ''' <summary>
        ''' One scraped page together with the links found on it. toNode, toName and
        ''' toDate are parallel arrays: index n in each belongs to the same anchor.
        ''' </summary>
        Public Class GraphNode
            'File name of the page this node describes; empty while the node is unused
            Public thisNode As String = ""
            'Link target of each anchor
            Public toNode() As String = Nothing
            'Text recorded for each anchor
            Public toName() As String = Nothing
            'Text recorded after each anchor
            Public toDate() As String = Nothing
            'Not assigned by this class
            Public currentDim As Integer

            ''' <summary>
            ''' Allocates the three link arrays and fills every slot with an empty string.
            ''' </summary>
            Sub New()
                Dim dimension As Integer = 300
                ReDim Me.toNode(dimension)
                ReDim Me.toName(dimension)
                ReDim Me.toDate(dimension)
                'ReDim takes the upper bound, so each array has dimension + 1 slots;
                'run through the last index too so that no slot is left as Nothing.
                For i = 0 To dimension
                    Me.toNode(i) = ""
                    Me.toName(i) = ""
                    Me.toDate(i) = ""
                Next
            End Sub
        End Class
        'Number of site-table slots that Main fills; parseLine dumps the table when graphIndex reaches this value
        Private mySiteLen As Integer = 300
        'Index of the site-table slot currently being filled; Main resets it to 0 and readStreamLine increments it before each parse
        Private graphIndex As Integer = -1
        'Links collected by parseLine that are still waiting to be opened and parsed
        Private FileQueue As Queue = New Queue(300)

        ''' <summary>
        ''' Entry point. Parses the start page, follows the local page links recorded in
        ''' the site table, and finally writes the table to the index file.
        ''' </summary>
        Sub Main()
            'Start with the website's root folder, and the menu, home page, or site map file
            Dim path As String = "C:\MySite\"
            Dim firstFile As String = "menu.html"

            graphIndex = 0

            'The main data structure for link data. The Dim argument is the upper bound,
            'so fill every slot, including the last one, to leave no Nothing entry behind.
            Dim mySite(mySiteLen) As Object
            For i = 0 To mySite.Length - 1
                mySite(i) = New GraphNode
            Next

            'Output file that stores the links between files
            Dim outputPath As String = path & "myIndex.txt"
            Dim outputFile As New StreamWriter(outputPath)

            'Release the start-page reader as soon as its text has been parsed
            Using myFile As New StreamReader(path & firstFile)
                mySite = readStreamLine(myFile, firstFile, mySite, outputFile)
            End Using

            'Several passes, so that links found on pages added by an earlier pass are followed too
            For recurCount = 0 To 3
                For anchors = 0 To mySite.Length - 1
                    Dim node As GraphNode = TryCast(mySite(anchors), GraphNode)
                    If node Is Nothing OrElse String.IsNullOrEmpty(node.thisNode) Then
                        Continue For
                    End If

                    'Follow the links recorded for this page. The thisNode entries themselves
                    'are pages that have already been parsed, so they are not read again.
                    For Each linkName As String In node.toNode
                        If Not isFollowableLink(linkName) Then Continue For
                        If isAlreadyParsed(mySite, linkName) Then Continue For
                        If Not File.Exists(path & linkName) Then Continue For

                        Try
                            'Every page needs its own reader: a reader that has been read to
                            'the end only yields an empty string afterwards.
                            Using linkFile As New StreamReader(path & linkName)
                                mySite = readStreamLine(linkFile, linkName, mySite, outputFile)
                            End Using
                        Catch ex As IOException
                            'Best effort: an unreadable page must not stop the rest of the crawl
                            System.Console.Error.WriteLine("Skipping " & linkName & ": " & ex.Message)
                        End Try
                    Next
                Next
            Next

            'Write the collected table even when it never filled up.
            'DumpToJSON closes the output file and ends the program.
            DumpToJSON(mySite, outputFile)
        End Sub

        'True when the link has no fragment or "http" in it and names an .html, .aspx or .php page
        Private Function isFollowableLink(link As String) As Boolean
            If String.IsNullOrEmpty(link) Then Return False
            If link.IndexOf("#") <> -1 OrElse link.IndexOf("http") <> -1 Then Return False
            Return link.IndexOf(".html") <> -1 OrElse link.IndexOf(".aspx") <> -1 OrElse link.IndexOf(".php") <> -1
        End Function

        'True when some node of the site table already describes the given page
        Private Function isAlreadyParsed(mySite As Object(), pageName As String) As Boolean
            For Each entry As Object In mySite
                Dim node As GraphNode = TryCast(entry, GraphNode)
                If node IsNot Nothing AndAlso node.thisNode = pageName Then Return True
            Next
            Return False
        End Function

        ''' <summary>
        ''' Reads the remaining text of a page, moves on to the next site-table slot and
        ''' hands the text to parseLine.
        ''' </summary>
        ''' <param name="myFile">Reader positioned on the page text.</param>
        ''' <param name="parentLink">Name of the page being read.</param>
        ''' <param name="mySite">Site table; replaced with the table returned by parseLine.</param>
        ''' <param name="outFile">Writer passed through to parseLine.</param>
        ''' <returns>The site table returned by parseLine.</returns>
        Function readStreamLine(myFile As StreamReader, parentLink As String, ByRef mySite As Object(), ByRef outFile As StreamWriter) As Object()
            'Read first: a failed read must not advance the slot index
            Dim pageContents As String = myFile.ReadToEnd()

            graphIndex = graphIndex + 1

            Dim updatedSite As Object() = parseLine(pageContents, parentLink, mySite, outFile)
            mySite = updatedSite
            Return updatedSite
        End Function


        ''' <summary>
        ''' Records the page in the current site-table slot, stores the target, text and
        ''' trailing text of every anchor found in fileText, then opens and parses the
        ''' queued local pages.
        ''' </summary>
        ''' <param name="fileText">Full text of the page.</param>
        ''' <param name="parentLink">Name of the page the text came from.</param>
        ''' <param name="mySite">Site table of GraphNode entries, indexed by graphIndex.</param>
        ''' <param name="outFile">Writer handed to DumpToJSON once the table is full.</param>
        ''' <returns>The site table.</returns>
        Function parseLine(fileText As String, parentLink As String, ByRef mySite As Object(), ByRef outFile As StreamWriter) As Object()
            'The table is full: write it out. DumpToJSON ends the program; the Return only
            'makes sure a slot past the filled range is never touched.
            If graphIndex >= mySiteLen Then
                DumpToJSON(mySite, outFile)
                Return mySite
            End If

            Dim node As GraphNode = DirectCast(mySite(graphIndex), GraphNode)
            node.thisNode = parentLink

            'Group 2 of the anchor pattern is the href value, group 6 the text between the tags
            Dim anchorPattern As String = "<a\s+([^>]*)href=""(([^""]*))""([^>]*)(.*?)>(.*?)<\/a>"
            Dim datePattern As String = "(?<=</a>).*(?=<)"

            Dim anchorMatches As MatchCollection = Regex.Matches(fileText, anchorPattern)
            Dim dateRegex As New Regex(datePattern)

            Dim n As Integer = 0

            For Each link As Match In anchorMatches
                'Take the href from its own capture group; the first quoted attribute
                'of the tag is not necessarily the href.
                Dim linkFinal As String = link.Groups(2).Value

                'Anchors beyond the capacity of the node arrays are not recorded
                If n < node.toNode.Length Then
                    node.toNode(n) = linkFinal
                    'Name and date are read from this very anchor. Indexing separate match
                    'lists by anchor number pairs them with the wrong anchor, and runs off
                    'the end of the list, whenever the lists differ in length.
                    node.toName(n) = link.Groups(6).Value

                    Dim afterAnchor As Integer = link.Index + link.Length
                    Dim dateMatch As Match = dateRegex.Match(fileText, afterAnchor)
                    If dateMatch.Success AndAlso dateMatch.Index = afterAnchor Then
                        node.toDate(n) = dateMatch.Value
                    End If
                    n += 1
                End If

                'Add all protocols and file extensions here
                If (linkFinal.IndexOf("#") = -1) And (linkFinal.IndexOf("http") = -1) And ((linkFinal.IndexOf(".html") <> -1) Or (linkFinal.IndexOf(".aspx") <> -1) Or (linkFinal.IndexOf(".php") <> -1)) Then
                    FileQueue.Enqueue(linkFinal)
                End If
            Next

            'Parse everything that is queued; pages parsed here may queue further links
            While FileQueue.Count > 0
                Dim thisFile As String = CStr(FileQueue.Dequeue())

                Try
                    'Using closes the reader even when parsing the page fails
                    Using newFile As StreamReader = openFile(thisFile)
                        mySite = readStreamLine(newFile, thisFile, mySite, outFile)
                    End Using
                Catch ex As Exception
                    'Best effort: a page that cannot be opened or parsed is skipped, but reported
                    System.Console.Error.WriteLine("Skipping " & thisFile & ": " & ex.Message)
                End Try
            End While

            Return mySite
        End Function



        ''' <summary>
        ''' Opens a linked page for reading, resolving its name against the website root folder.
        ''' </summary>
        ''' <param name="fileName">Link target relative to the site root; leading slashes or backslashes are ignored.</param>
        ''' <returns>A StreamReader on the file; the caller closes it.</returns>
        Function openFile(fileName As String) As StreamReader
            'NOTE(review): Main reads its pages from "C:\MySite\" while this root is "C:\MyWebSite" - confirm which one is intended
            Dim siteRoot As String = "C:\MyWebSite"

            'Joining root and name without a separator turns "page.html" into "C:\MyWebSitepage.html".
            'Strip leading separators first, because Path.Combine drops the root when the second part is rooted.
            Dim relativeName As String = fileName.TrimStart("/"c, "\"c)
            Dim filePath As String = System.IO.Path.Combine(siteRoot, relativeName)

            Return New StreamReader(filePath)
        End Function


        ''' <summary>
        ''' Opens the menu file, the page the scrape starts from.
        ''' </summary>
        ''' <returns>An open FileStream on the menu file.</returns>
        Function openMenu() As FileStream
            Return File.Open("C:\MyWebSite\menu.html", FileMode.Open)
        End Function

        ''' <summary>
        ''' Writes every used node of the site table to outFile as a single JSON object,
        ''' closes the file and ends the program.
        ''' </summary>
        ''' <param name="mySite">Site table of GraphNode entries; entries without a page name are skipped.</param>
        ''' <param name="outFile">Open writer for the index file; closed before the program ends.</param>
        Function DumpToJSON(mySite As Object(), outFile As StreamWriter)
            Dim quote As String = Chr(34)

            'Write the opening JSON brace
            outFile.WriteLine("{")

            'Number of nodes written so far
            Dim c As Integer = 0

            'Loop through the root pages
            For i = 0 To mySite.Length - 1
                Dim node As GraphNode = TryCast(mySite(i), GraphNode)
                If node Is Nothing OrElse String.IsNullOrEmpty(node.thisNode) Then
                    Continue For
                End If

                'The separator goes in front of every node but the first, so that skipped
                'slots can never leave a trailing comma behind.
                If c > 0 Then
                    outFile.WriteLine(",")
                End If
                c += 1

                outFile.WriteLine(quote & "node" & c & quote & ": {")
                outFile.WriteLine(quote & "thisNode" & quote & ":" & quote & escapeJson(node.thisNode) & quote & ",")
                outFile.WriteLine(quote & "paths" & quote & ": [")

                'Loop through the associated pages
                Dim pathCount As Integer = 0
                For n = 0 To node.toNode.Length - 1
                    If String.IsNullOrEmpty(node.toNode(n)) AndAlso String.IsNullOrEmpty(node.toName(n)) Then
                        Continue For
                    End If

                    'Count written entries rather than array indexes when placing commas
                    If pathCount > 0 Then
                        outFile.Write(",")
                    End If
                    pathCount += 1

                    outFile.WriteLine("{")
                    outFile.WriteLine(quote & "toNode" & quote & ":" & quote & escapeJson(node.toNode(n)) & quote & ",")
                    outFile.WriteLine(quote & "toName" & quote & ":" & quote & escapeJson(node.toName(n)) & quote & ",")
                    outFile.WriteLine(quote & "toDate" & quote & ":" & quote & escapeJson(node.toDate(n)) & quote)
                    outFile.Write("}")
                Next
                outFile.WriteLine("]")

                outFile.Write("}")
            Next

            If c > 0 Then
                outFile.WriteLine()
            End If

            'Write the closing brace exactly once
            outFile.WriteLine("}")

            outFile.Close()

            'The dump is the last thing the scraper does: stop the program here
            End
        End Function

        'Escapes a value so that it can be placed between double quotes in JSON output
        Private Function escapeJson(text As String) As String
            If text Is Nothing Then Return ""

            Dim escaped As New System.Text.StringBuilder(text.Length)
            For Each ch As Char In text
                Select Case ch
                    Case Chr(34)
                        escaped.Append("\" & Chr(34))
                    Case "\"c
                        escaped.Append("\\")
                    Case ChrW(10)
                        escaped.Append("\n")
                    Case ChrW(13)
                        escaped.Append("\r")
                    Case ChrW(9)
                        escaped.Append("\t")
                    Case Else
                        If AscW(ch) < 32 Then
                            'Remaining control characters must be written as \u escapes
                            escaped.Append("\u" & AscW(ch).ToString("x4"))
                        Else
                            escaped.Append(ch)
                        End If
                End Select
            Next
            Return escaped.ToString()
        End Function

    End Module
End Namespace