|
1 | 1 | package minidoc
|
2 | 2 |
|
3 | 3 | import (
|
| 4 | + "bytes" |
4 | 5 | "fmt"
|
| 6 | + "golang.org/x/net/html" |
5 | 7 | "io/ioutil"
|
6 | 8 | "net/http"
|
7 | 9 | "os"
|
8 | 10 | "os/exec"
|
| 11 | + "strings" |
9 | 12 | "syscall"
|
10 | 13 | "time"
|
| 14 | + |
| 15 | + xmlpath "gopkg.in/xmlpath.v2" |
11 | 16 | )
|
12 | 17 |
|
13 | 18 | func OpenFileIfNoneExist(filepath, content string) error {
|
@@ -161,3 +166,65 @@ func HTTPGet(url string) ([]byte, error) {
|
161 | 166 |
|
162 | 167 | return data, err
|
163 | 168 | }
|
| 169 | + |
| 170 | +// ScreenScrape hits the given URL and screen scrape then return dom like object for searching |
| 171 | +func ScreenScrape(url string) (*xmlpath.Node, error) { |
| 172 | + |
| 173 | + client := http.Client{ |
| 174 | + Timeout: 3 * time.Second, |
| 175 | + } |
| 176 | + |
| 177 | + resp, err := client.Get(url) |
| 178 | + if err != nil { |
| 179 | + return nil, err |
| 180 | + } |
| 181 | + defer resp.Body.Close() |
| 182 | + if resp.StatusCode != 200 { |
| 183 | + return nil, fmt.Errorf("failed") |
| 184 | + } |
| 185 | + |
| 186 | + pageContent, err := ioutil.ReadAll(resp.Body) |
| 187 | + |
| 188 | + reader := strings.NewReader(string(pageContent)) |
| 189 | + root, err := html.Parse(reader) |
| 190 | + if err != nil { |
| 191 | + log.Fatal(err) |
| 192 | + } |
| 193 | + |
| 194 | + var b bytes.Buffer |
| 195 | + html.Render(&b, root) |
| 196 | + fixedHTML := b.String() |
| 197 | + |
| 198 | + reader = strings.NewReader(fixedHTML) |
| 199 | + xmlroot, xmlerr := xmlpath.ParseHTML(reader) |
| 200 | + |
| 201 | + if xmlerr != nil { |
| 202 | + log.Fatal(xmlerr) |
| 203 | + } |
| 204 | + |
| 205 | + return xmlroot, nil |
| 206 | +} |
| 207 | + |
| 208 | +// SearchByXPath will walk down the node and children using xpath expression |
| 209 | +func SearchByXPath(context *xmlpath.Node, xpath string) []*xmlpath.Node { |
| 210 | + path := xmlpath.MustCompile(xpath) |
| 211 | + |
| 212 | + nodes := make([]*xmlpath.Node, 0, 100) |
| 213 | + |
| 214 | + iter := path.Iter(context) |
| 215 | + for iter.Next() { |
| 216 | + nodes = append(nodes, iter.Node()) |
| 217 | + } |
| 218 | + |
| 219 | + return nodes |
| 220 | +} |
| 221 | + |
| 222 | +// XPathGet xpath get by index |
| 223 | +func XPathGet(context *xmlpath.Node, xpath string, index int) string { |
| 224 | + nodes := SearchByXPath(context, xpath) |
| 225 | + if index >= len(nodes) { |
| 226 | + fmt.Println("failed to get ", xpath, " index:", index) |
| 227 | + return "" |
| 228 | + } |
| 229 | + return strings.TrimSpace(nodes[index].String()) |
| 230 | +} |
0 commit comments