@@ -115,6 +115,9 @@ func (c Crawler) validate() error {
115115// TODO: how are we handling javascript: and so on?
116116// what about URL without http:// ? Like example.com or 127.0.0.1:1234
117117func isRelativeWithoutSlash (s string ) bool {
118+ if strings .HasPrefix (s , "#" ) {
119+ return false
120+ }
118121 if strings .HasPrefix (s , "http://" ) {
119122 return false
120123 }
@@ -139,6 +142,29 @@ func isRelativeWithoutSlash(s string) bool {
139142 return true
140143}
141144
// ensureTrailingSlash returns a URL whose path ends in "/" so that relative
// references resolved against it are treated as children of that path rather
// than as siblings of its last segment.
//
// The input is returned unchanged (same pointer) when the path is empty,
// already ends in "/", or its last segment contains a "." (taken to be a file
// name with an extension). Otherwise a modified copy is returned; u itself is
// never mutated.
func ensureTrailingSlash(u *url.URL) *url.URL {
	if u.Path == "" || strings.HasSuffix(u.Path, "/") {
		return u
	}
	// LastIndexByte yields -1 when the path has no slash, in which case the
	// slice below covers the whole path.
	lastSegment := u.Path[strings.LastIndexByte(u.Path, '/')+1:]
	// No trailing slash if there is a file extension.
	if strings.Contains(lastSegment, ".") {
		return u
	}
	// Copy the struct instead of round-tripping through String/Parse: the
	// round trip can fail (and previously panicked) for URLs that do not
	// re-parse cleanly.
	withSlash := *u
	withSlash.Path += "/"
	// Keep the encoded form in step with Path; otherwise RawPath no longer
	// matches Path and the original escaping (e.g. %2F) is dropped.
	if withSlash.RawPath != "" {
		withSlash.RawPath += "/"
	}
	return &withSlash
}
167+
142168// Returns a list of only valid URLs.
143169// Invalid protocols such as mailto or javascript are ignored.
144170// The returned error shows all invalid URLs in one message.
@@ -153,6 +179,8 @@ func toURLs(links []string, parse func(string) (*url.URL, error)) (urls []*url.U
153179 invalids = append (invalids , fmt .Sprintf ("%s (%v)" , s , e ))
154180 continue
155181 }
182+ // Remove #hash
183+ u .Fragment = ""
156184 // Default to https
157185 if u .Scheme == "" {
158186 u .Scheme = "https"
@@ -237,7 +265,10 @@ func (c Crawler) worker(
237265 results <- fmt .Sprintf ("%v %v" , s .Parent , s .URL .String ())
238266 }
239267
240- urls , err := toURLs (links , s .URL .Parse )
268+ // Ensure we can resolve relative paths properly
269+ urlWithTrailingSlash := ensureTrailingSlash (s .URL )
270+
271+ urls , err := toURLs (links , urlWithTrailingSlash .Parse )
241272 if err != nil {
242273 c .Log .Printf ("page %v: %v\n " , s .URL , err )
243274 }
0 commit comments