Skip to content
This repository was archived by the owner on Dec 13, 2022. It is now read-only.

Commit 07daeb1

Browse files
committed
Resolve relative links on pages that don't end with a /
Fixes #8
1 parent 9c182c0 commit 07daeb1

File tree

2 files changed

+33
-2
lines changed

2 files changed

+33
-2
lines changed

httpsyet/crawler.go

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,9 @@ func (c Crawler) validate() error {
115115
// TODO: how are we handling javascript: and so on?
116116
// what about URL without http:// ? Like example.com or 127.0.0.1:1234
117117
func isRelativeWithoutSlash(s string) bool {
118+
if strings.HasPrefix(s, "#") {
119+
return false
120+
}
118121
if strings.HasPrefix(s, "http://") {
119122
return false
120123
}
@@ -139,6 +142,29 @@ func isRelativeWithoutSlash(s string) bool {
139142
return true
140143
}
141144

145+
// ensureTrailingSlash returns a URL whose path ends in "/", so that relative
// links resolved against it (via URL.Parse) are treated as children of the
// page rather than as siblings of its last path segment.
//
// u is returned unchanged when its path is empty, when the path already ends
// in "/", or when the last path segment contains a "." (taken to be a file
// name with an extension). Otherwise a modified copy is returned; u itself is
// never mutated.
func ensureTrailingSlash(u *url.URL) *url.URL {
	if u.Path == "" || strings.HasSuffix(u.Path, "/") {
		return u
	}
	// No trailing slash if there is a file extension
	last := u.Path[strings.LastIndex(u.Path, "/")+1:]
	if strings.Contains(last, ".") {
		return u
	}
	// A struct copy clones the URL exactly and cannot fail, unlike
	// re-parsing u.String(), which needed a panic for the error path.
	uNew := *u
	uNew.Path += "/"
	// Keep the encoded form in sync with Path. If RawPath is left stale,
	// EscapedPath ignores it and the original escaping (e.g. %2F) is lost.
	if uNew.RawPath != "" {
		uNew.RawPath += "/"
	}
	return &uNew
}
167+
142168
// Returns a list of only valid URLs.
143169
// Invalid protocols such as mailto or javascript are ignored.
144170
// The returned error shows all invalid URLs in one message.
@@ -153,6 +179,8 @@ func toURLs(links []string, parse func(string) (*url.URL, error)) (urls []*url.U
153179
invalids = append(invalids, fmt.Sprintf("%s (%v)", s, e))
154180
continue
155181
}
182+
// Remove #hash
183+
u.Fragment = ""
156184
// Default to https
157185
if u.Scheme == "" {
158186
u.Scheme = "https"
@@ -237,7 +265,10 @@ func (c Crawler) worker(
237265
results <- fmt.Sprintf("%v %v", s.Parent, s.URL.String())
238266
}
239267

240-
urls, err := toURLs(links, s.URL.Parse)
268+
// Ensure we can resolve relative paths properly
269+
urlWithTrailingSlash := ensureTrailingSlash(s.URL)
270+
271+
urls, err := toURLs(links, urlWithTrailingSlash.Parse)
241272
if err != nil {
242273
c.Log.Printf("page %v: %v\n", s.URL, err)
243274
}

httpsyet/crawler_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ const (
5555
This is an internal link which should result in a 404
5656
</a>
5757
58-
<a href="sub/">
58+
<a href="/sub/">
5959
This is an relative internal link to a page without children.
6060
</a>
6161

0 commit comments

Comments
 (0)