This is probably my third Go application. It takes one or two command-line arguments naming Wikipedia articles, pulls every /wiki/ link that isn't a special page, memoizes visited pages so the same page is never loaded twice, and counts how many 'clicks' it takes to get from the source article to the target article.
As it's only my third Go application, I'm still very new to Go's style; I feel I'm definitely missing something about the error interface (I've put a couple of rough sketches of alternatives I was considering after the code), and overall it seems a bit messy. Any feedback from experienced Gophers, large or small, would be greatly appreciated.
package main
import (
"fmt"
"golang.org/x/net/html"
"io"
"net/http"
"os"
"strings"
"time"
)
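
// Article is one Wikipedia page in the search. parent points at the page it
// was discovered from, so the click path can be reconstructed by walkParents,
// and retries counts failed fetch attempts in GetUrl.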
type Article struct {
name string
url string
parent *Article
retries uint
}
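
// main parses the arguments, then runs the search as a single select loop:
// links discovered by LoadPage goroutines arrive on urlChannel and are queued
// in a FIFO buffer, the default case dispatches queued pages (capped at
// roughly 1000 in-flight requests), and a value on foundChannel means the
// target was reached and the click path can be printed.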
func main() {
var target string
var source string
args := os.Args[1:]
if len(args) == 0 {
fmt.Println("Usage: wikirace SOURCE [DESTINATION]")
fmt.Println("If DESTINATION is ommited, SOURCE will be used as DESTINATION and SOURCE will be a random article")
fmt.Println("Format of articles should be either '/wiki/ARTICLENAME' or just 'ARTICLENAME'")
return
} else if len(args) == 1 {
target = args[0]
source = ""
} else {
source = args[0]
target = args[1]
}
if !strings.HasPrefix(target, "/wiki/") {
target = "/wiki/" + target
}
foundChannel := make(chan Article) // indicates target is found when written to
urlChannel := make(chan Article) // indicates a new URL needs loaded
memo := make([]string, 0) // Slice stores already-visited pages
buffer := make([]Article, 0, 2000) // Stores pages that need to be loaded
count := 0 // Counts currently waiting socket connections to limit file descriptors
tracker := make(map[string]int) // Hash map tracks pages that have been requested, but received no results
if source == "" {
source = "Special:Random"
}
if !strings.HasPrefix(source, "/wiki/") {
source = "/wiki/" + source
}
start := Article{source, "http://en.wikipedia.org" + source, nil, 0}
count++
tracker[source] = 1
fmt.Print("Searching...")
go LoadPage(start, target, foundChannel, urlChannel)
// Wait on channels, defaulting to shifting items off the buffer stack
for {
select {
case art := <-urlChannel:
art.url = "http://en.wikipedia.org" + art.name
_, present := tracker[art.parent.name] // check to see if the parent name is present in the tracker
if present {
delete(tracker, art.parent.name) // delete the parent's name, as that connection has closed
count-- // decrement count so we can use another connection
}
buffer = append(buffer, art)
case art := <-foundChannel: // this means the target article was found
fmt.Println()
fmt.Println("Found target URL in article", art.name)
fmt.Println(len(memo), "unique articles searched")
fmt.Println("Clicks to reach target:")
thisArticle := Article{target, "", &art, 0} // create an article for the target, so it prints nicely
path := walkParents(thisArticle) // get an array of all parents
top := len(path)
for i := top - 1; i > -1; i-- { // then print them in reverse order
fmt.Println(top-(i+1), path[i])
}
return
default:
if count < 1000 && len(buffer) > 0 {
next := buffer[0] // Take the zeroth element
buffer = buffer[1:] // and remove it from the buffer (FIFO)
// fmt.Println("Loading page", next.name)
if !contains(&memo, next.url) {
count++ // increment the counter to keep from overflowing file descriptors
tracker[next.name] = 1 // record the URL so count can be decremented appropriately
memo = append(memo, next.url)
go LoadPage(next, target, foundChannel, urlChannel)
}
}
}
}
}
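
// walkParents follows parent pointers from art back to the starting article
// and returns the article names, child first.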
func walkParents(art Article) []string {
array := make([]string, 0, 30)
parent := &art
for {
if parent != nil {
array = append(array, parent.name)
parent = parent.parent
} else {
return array
}
}
}
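
// LoadPage sleeps briefly, fetches one article, reports art on found if one
// of its links matches target, and otherwise sends each extracted /wiki/ link
// back on c for the main loop to queue.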
func LoadPage(art Article, target string, found chan Article, c chan Article) {
fmt.Print(".")
t := time.Duration(1) * time.Second // sleep to maybe help with DOS prevention
time.Sleep(t)
resp := GetUrl(&art)
urls := FindMainContentLinks(resp)
for _, value := range urls {
// fmt.Println("Found link to page:", string(value))
		if value == target {
			found <- art
			return // stop after reporting the target; main exits once it reads from found
		}
		child := Article{value, "", &art, 0} // renamed from "new", which shadows the builtin
		c <- child
	}
}
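
// GetUrl performs the HTTP request for art, retrying with a short sleep on
// connection resets, EOFs and timeouts (at most three retries per article),
// and panicking on any other error or once the retry budget is spent.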
func GetUrl(art *Article) io.ReadCloser {
response, err := http.Get(art.url)
if err != nil {
if art.retries > 2 {
panic(err)
}
if strings.HasSuffix(err.Error(), "connection reset by peer") {
fmt.Print("R")
t := time.Duration(5) * time.Second // sleep to maybe help with DOS prevention and recover from err
art.retries++
time.Sleep(t)
return GetUrl(art)
} else if strings.HasSuffix(err.Error(), "EOF") {
fmt.Print("E")
t := time.Duration(5) * time.Second // sleep to maybe help with DOS prevention and recover from err
art.retries++
time.Sleep(t)
return GetUrl(art)
} else if strings.HasSuffix(err.Error(), "timeout") {
fmt.Print("T")
t := time.Duration(2) * time.Second // sleep to maybe help with DOS prevention and recover from err
art.retries++
time.Sleep(t)
return GetUrl(art)
} else {
panic(err)
}
}
return response.Body
}
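
// FindMainContentLinks tokenizes the HTML body and returns the href of every
// anchor that points at a /wiki/ page, skipping anything containing ':'
// (special pages) and the Main Page. It closes the body when the document ends.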
func FindMainContentLinks(body io.ReadCloser) []string {
tokenizer := html.NewTokenizer(body)
urls := make([]string, 0)
for {
token := tokenizer.Next()
switch {
case token == html.ErrorToken:
body.Close()
return urls
case token == html.StartTagToken:
tag := tokenizer.Token()
if tag.Data == "a" {
for _, attr := range tag.Attr {
value := attr.Val
if attr.Key == "href" {
if strings.HasPrefix(value, "/wiki/") && !strings.Contains(value, ":") && !strings.HasSuffix(value, "Main_Page") {
urls = append(urls, value)
}
}
}
}
}
}
}
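
// contains reports whether str is present in the slice, using a linear scan.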
func contains(a *[]string, str string) bool {
for _, value := range *a {
if value == str {
return true
}
}
return false
}
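
The part I'm least sure about is the error handling in GetUrl: matching on error strings and panicking feels wrong. Below is a minimal sketch of the shape I suspect is more idiomatic, returning the error to the caller and using errors.As with net.Error to detect timeouts instead of comparing strings. The name getURL, the shared http.Client parameter, and the lack of retries are all just assumptions for the sketch, and it would need "errors" and "net" added to the import block.

// Sketch only, not part of the program above.
func getURL(client *http.Client, url string) (io.ReadCloser, error) {
	resp, err := client.Get(url)
	if err != nil {
		// net.Error exposes Timeout(), so the caller can decide whether to
		// retry without inspecting the error message text.
		var netErr net.Error
		if errors.As(err, &netErr) && netErr.Timeout() {
			return nil, fmt.Errorf("timeout fetching %s: %w", url, err)
		}
		return nil, fmt.Errorf("fetching %s: %w", url, err)
	}
	return resp.Body, nil
}

With that shape, LoadPage (or whatever ends up calling it) would own the retry and sleep policy instead of GetUrl panicking deep inside a goroutine.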
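
I also wondered whether the memo slice plus contains() should simply be a map, since membership checks would then be O(1) instead of a linear scan. A small self-contained sketch of the idea (the visited name and the example URLs are made up):

// Sketch only: a map-based replacement for memo + contains().
package main

import "fmt"

func main() {
	// The zero value (false) means "not seen yet", so no helper is needed.
	visited := make(map[string]bool)
	for _, url := range []string{"/wiki/Go", "/wiki/Go", "/wiki/Gopher"} {
		if visited[url] {
			continue // duplicate; skip it
		}
		visited[url] = true
		fmt.Println("would queue", url)
	}
}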