This is probably my third Go application. It takes one or two command-line arguments naming Wikipedia articles, pulls every /wiki/ link that isn't a special page, memoizes visited pages so the same page is never loaded twice, and counts how many 'clicks' it takes to get from the source article to the target article.
As it's only my third Go application, I'm still very new to Go's style; I feel I'm definitely missing something about the error interface (I've put a couple of rough sketches of alternatives I was considering after the code), and overall it seems a bit messy. Any feedback from experienced Gophers, large or small, would be greatly appreciated.
package main
import (
"fmt"
"golang.org/x/net/html"
"io"
"net/http"
"os"
"strings"
"time"
)
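
// Article is one Wikipedia page in the search. parent points at the page it
// was discovered from, so the click path can be reconstructed by walkParents,
// and retries counts failed fetch attempts in GetUrl.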
type Article struct {
name string
url string
parent *Article
retries uint
}
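
// main parses the arguments, then runs the search as a single select loop:
// links discovered by LoadPage goroutines arrive on urlChannel and are queued
// in a FIFO buffer, the default case dispatches queued pages (capped at
// roughly 1000 in-flight requests), and a value on foundChannel means the
// target was reached and the click path can be printed.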
func main() {
var target string
var source string
args := os.Args[1:]
if len(args) == 0 {
fmt.Println("Usage: wikirace SOURCE [DESTINATION]")
fmt.Println("If DESTINATION is ommited, SOURCE will be used as DESTINATION and SOURCE will be a random article")
fmt.Println("Format of articles should be either '/wiki/ARTICLENAME' or just 'ARTICLENAME'")
return
} else if len(args) == 1 {
target = args[0]
source = ""
} else {
source = args[0]
target = args[1]
}
if !strings.HasPrefix(target, "/wiki/") {
target = "/wiki/" + target
}
foundChannel := make(chan Article) // indicates target is found when written to
urlChannel := make(chan Article) // indicates a new URL needs loaded
memo := make([]string, 0) // Slice stores already-visited pages
buffer := make([]Article, 0, 2000) // Stores pages that need to be loaded
count := 0 // Counts currently waiting socket connections to limit file descriptors
tracker := make(map[string]int) // Hash map tracks pages that have been requested, but received no results
if source == "" {
source = "Special:Random"
}
if !strings.HasPrefix(source, "/wiki/") {
source = "/wiki/" + source
}
start := Article{source, "http://en.wikipedia.org" + source, nil, 0}
count++
tracker[source] = 1
fmt.Print("Searching...")
go LoadPage(start, target, foundChannel, urlChannel)
// Wait on channels, defaulting to shifting items off the buffer stack
for {
select {
case art := <-urlChannel:
art.url = "http://en.wikipedia.org" + art.name
_, present := tracker[art.parent.name] // check to see if the parent name is present in the tracker
if present {
delete(tracker, art.parent.name) // delete the parent's name, as that connection has closed
count-- // decrement count so we can use another connection
}
buffer = append(buffer, art)
case art := <-foundChannel: // this means the target article was found
fmt.Println()
fmt.Println("Found target URL in article", art.name)
fmt.Println(len(memo), "unique articles searched")
fmt.Println("Clicks to reach target:")
thisArticle := Article{target, "", &art, 0} // create an article for the target, so it prints nicely
path := walkParents(thisArticle) // get an array of all parents
top := len(path)
for i := top - 1; i > -1; i-- { // then print them in reverse order
fmt.Println(top-(i+1), path[i])
}
return
default:
if count < 1000 && len(buffer) > 0 {
next := buffer[0] // Take the zeroth element
buffer = buffer[1:] // and remove it from the buffer (FIFO)
// fmt.Println("Loading page", next.name)
if !contains(&memo, next.url) {
count++ // increment the counter to keep from overflowing file descriptors
tracker[next.name] = 1 // record the URL so count can be decremented appropriately
memo = append(memo, next.url)
go LoadPage(next, target, foundChannel, urlChannel)
}
}
}
}
}
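
// walkParents follows parent pointers from art back to the starting article
// and returns the article names, child first.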
func walkParents(art Article) []string {
array := make([]string, 0, 30)
parent := &art
for {
if parent != nil {
array = append(array, parent.name)
parent = parent.parent
} else {
return array
}
}
}
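
// LoadPage sleeps briefly, fetches one article, reports art on found if one
// of its links matches target, and otherwise sends each extracted /wiki/ link
// back on c for the main loop to queue.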
func LoadPage(art Article, target string, found chan Article, c chan Article) {
fmt.Print(".")
t := time.Duration(1) * time.Second // sleep to maybe help with DOS prevention
time.Sleep(t)
resp := GetUrl(&art)
urls := FindMainContentLinks(resp)
for _, value := range urls {
// fmt.Println("Found link to page:", string(value))
		if value == target {
			found <- art
			return // stop after reporting the target; main exits once it reads from found
		}
		child := Article{value, "", &art, 0} // renamed from "new", which shadows the builtin
		c <- child
	}
}
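
// GetUrl performs the HTTP request for art, retrying with a short sleep on
// connection resets, EOFs and timeouts (at most three retries per article),
// and panicking on any other error or once the retry budget is spent.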
func GetUrl(art *Article) io.ReadCloser {
response, err := http.Get(art.url)
if err != nil {
if art.retries > 2 {
panic(err)
}
if strings.HasSuffix(err.Error(), "connection reset by peer") {
fmt.Print("R")
t := time.Duration(5) * time.Second // sleep to maybe help with DOS prevention and recover from err
art.retries++
time.Sleep(t)
return GetUrl(art)
} else if strings.HasSuffix(err.Error(), "EOF") {
fmt.Print("E")
t := time.Duration(5) * time.Second // sleep to maybe help with DOS prevention and recover from err
art.retries++
time.Sleep(t)
return GetUrl(art)
} else if strings.HasSuffix(err.Error(), "timeout") {
fmt.Print("T")
t := time.Duration(2) * time.Second // sleep to maybe help with DOS prevention and recover from err
art.retries++
time.Sleep(t)
return GetUrl(art)
} else {
panic(err)
}
}
return response.Body
}
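
// FindMainContentLinks tokenizes the HTML body and returns the href of every
// anchor that points at a /wiki/ page, skipping anything containing ':'
// (special pages) and the Main Page. It closes the body when the document ends.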
func FindMainContentLinks(body io.ReadCloser) []string {
tokenizer := html.NewTokenizer(body)
urls := make([]string, 0)
for {
token := tokenizer.Next()
switch {
case token == html.ErrorToken:
body.Close()
return urls
case token == html.StartTagToken:
tag := tokenizer.Token()
if tag.Data == "a" {
for _, attr := range tag.Attr {
value := attr.Val
if attr.Key == "href" {
if strings.HasPrefix(value, "/wiki/") && !strings.Contains(value, ":") && !strings.HasSuffix(value, "Main_Page") {
urls = append(urls, value)
}
}
}
}
}
}
}
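
// contains reports whether str is present in the slice, using a linear scan.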
func contains(a *[]string, str string) bool {
for _, value := range *a {
if value == str {
return true
}
}
return false
}
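
The part I'm least sure about is the error handling in GetUrl: matching on error strings and panicking feels wrong. Below is a minimal sketch of the shape I suspect is more idiomatic, returning the error to the caller and using errors.As with net.Error to detect timeouts instead of comparing strings. The name getURL, the shared http.Client parameter, and the lack of retries are all just assumptions for the sketch, and it would need "errors" and "net" added to the import block.

// Sketch only, not part of the program above.
func getURL(client *http.Client, url string) (io.ReadCloser, error) {
	resp, err := client.Get(url)
	if err != nil {
		// net.Error exposes Timeout(), so the caller can decide whether to
		// retry without inspecting the error message text.
		var netErr net.Error
		if errors.As(err, &netErr) && netErr.Timeout() {
			return nil, fmt.Errorf("timeout fetching %s: %w", url, err)
		}
		return nil, fmt.Errorf("fetching %s: %w", url, err)
	}
	return resp.Body, nil
}

With that shape, LoadPage (or whatever ends up calling it) would own the retry and sleep policy instead of GetUrl panicking deep inside a goroutine.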
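
I also wondered whether the memo slice plus contains() should simply be a map, since membership checks would then be O(1) instead of a linear scan. A small self-contained sketch of the idea (the visited name and the example URLs are made up):

// Sketch only: a map-based replacement for memo + contains().
package main

import "fmt"

func main() {
	// The zero value (false) means "not seen yet", so no helper is needed.
	visited := make(map[string]bool)
	for _, url := range []string{"/wiki/Go", "/wiki/Go", "/wiki/Gopher"} {
		if visited[url] {
			continue // duplicate; skip it
		}
		visited[url] = true
		fmt.Println("would queue", url)
	}
}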