· 6 years ago · Dec 11, 2019, 12:22 PM
1package main
2
3import (
4 "fmt"
5 "golang.org/x/net/html"
6 "net/http"
7 "strings"
8)
9
10func getChildren(node *html.Node) []*html.Node {
11 var children []*html.Node
12 for c := node.FirstChild; c != nil; c = c.NextSibling {
13 children = append(children, c)
14 }
15 return children
16}
17
18func getAttr(node *html.Node, key string) string {
19 for _, attr := range node.Attr {
20 if attr.Key == key {
21 return attr.Val
22 }
23 }
24 return ""
25}
26
27func isText(node *html.Node) bool {
28 return node != nil && node.Type == html.TextNode
29}
30
31func isElem(node *html.Node, tag string) bool {
32 return node != nil && node.Type == html.ElementNode && node.Data == tag
33}
34
35func isElemWithClass(node *html.Node, tag string, class string) bool {
36 return isElem(node, tag) && strings.Contains(getAttr(node, "class"), class)
37}
38
39func isDiv(node *html.Node, class string) bool{
40 return isElemWithClass(node, "div", class)
41}
42
43func readItem(item *html.Node) *Item {
44 if getAttr(item, "class") == "first-item" {
45 return readItem(getChildren(item)[1])
46 }
47 if a := item.FirstChild; isElem(a, "a") {
48 if cs := getChildren(a); len(cs) == 2 && isElem(cs[0], "time") && isText(cs[1]) {
49 return &Item{
50 Ref: getAttr(a, "href"),
51 Time: getAttr(cs[0], "datetime"),
52 Title: cs[1].Data,
53 }
54 }
55 if cs := getChildren(a); len(cs) == 1 && isText(cs[0]) {
56 return &Item{
57 Ref: getAttr(a, "href"),
58 Time: "",
59 Title: cs[0].Data,
60 }
61 }
62 }
63 return nil
64}
65
// Item is a single news headline extracted from a page.
type Item struct {
	Ref   string // href attribute of the headline link
	Time  string // datetime attribute of the <time> element; empty when there is none
	Title string // headline text
}
69
70
71func downloadNews(url string) *html.Node {
72 if response, err := http.Get(url); err != nil {
73 fmt.Println("request to" ,url, "failed", "error", err)
74 } else {
75 defer response.Body.Close()
76 status := response.StatusCode
77 if status == http.StatusOK {
78 if doc, err := html.Parse(response.Body); err != nil {
79 fmt.Println("invalid HTML from", url, "error", err)
80 } else {
81 return doc
82 }
83 }
84 }
85 return nil
86}
87
88func search(node *html.Node, tag string, class string) []*html.Node {
89 var nodes []*html.Node
90 if isElemWithClass(node, tag, class) {
91 nodes = append(nodes, node)
92 return nodes
93 }
94 for c := node.FirstChild; c != nil; c = c.NextSibling {
95 if s := search(c, tag, class); s != nil {
96 nodes = append(nodes, s...)
97 }
98 }
99 if len(nodes) == 0 {
100 return nil
101 }
102 return nodes
103}
104
105
106
107//===================================================================================================
108
109
110
111func main() {
112 fmt.Printf("%53s%s\n\n", "", "LENTA.RU")
113 document := downloadNews("http://lenta.ru")
114 {
115 nodes := search(document, "div", "b-yellow-box__wrap")
116 for _, node := range nodes {
117 for c := node.FirstChild; c != nil; c = c.NextSibling {
118 if isDiv(c, "item"){
119 if item := readItem(c); item != nil {
120 fmt.Printf("%53s%s\n", "", item.Title)
121 }
122 }
123 }
124 }
125 }
126 {
127 nodes := search(document, "div", "span4")
128 for _, node := range nodes {
129 for c := node.FirstChild; c != nil; c = c.NextSibling {
130 if isDiv(c, "item") || isDiv(c, "first-item") {
131 if item := readItem(c); item != nil {
132 fmt.Printf("%50s - %s\n", item.Time, item.Title)
133 }
134 }
135 }
136 }
137 }
138
139 fmt.Printf("\n%53s%s\n\n", "", "M24.RU")
140 document = downloadNews("https://www.m24.ru/news")
141 {
142 nodes := search(document, "div", "b-materials-list")
143 node := nodes[0]
144 var date string = ""
145 for c := node.FirstChild; c != nil; c = c.NextSibling {
146 if isElem(c, "p"){
147 date = c.FirstChild.Data
148 } else if isElem(c, "ul"){
149 items := search(c, "li", "")
150 for _, item := range items {
151 titleNode := search(item, "p", "b-materials-list__title")[0]
152 timeNode := search(titleNode, "span", "b-materials-list__date")[0]
153 titleNode = search(titleNode, "a", "")[0]
154 title := strings.ReplaceAll(getChildren(titleNode)[2].Data, "\n\t\t", "")
155 time := timeNode.FirstChild.Data
156 fmt.Printf("%50s - %s\n", date + " " + time, title)
157 }
158 }
159 }
160 }
161
162 fmt.Printf("\n%53s%s\n\n", "", "YANDEX.RU")
163 document = downloadNews("https://yandex.ru/news")
164 {
165 nodes := search(document, "div", "story")
166 for _, node := range nodes{
167 titleNode := search(node, "h2", "story__title")[0].FirstChild
168 infoNode := search(node, "div", "story__date")[0]
169 title := titleNode.FirstChild.Data
170 info := infoNode.FirstChild.Data
171 fmt.Printf("%50s - %s\n", info, title)
172 }
173 }
174}