解析数据

使用 Jsoup 解析 HTML 标签

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.junit.Test;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Scratch tests for pulling data out of HTML fragments and inline CSS: a regular
 * expression over {@code <font>} elements, a Jsoup parse, and two small helpers that
 * read a single property from a {@code style} attribute string.
 *
 * <p>NOTE(review): this class has the same simple name as the imported
 * {@code org.junit.Test} annotation. javac rejects a compilation unit that both imports
 * and declares a type with the same simple name, so the class (and its file) needs a
 * different name before the file compiles. The name is left untouched here because
 * renaming it would change the public interface.
 */
public class Test {

    /**
     * Matches one {@code <font ...>text</font>} element; group 1 is the text between the
     * tags. Compiled once because {@link Pattern} instances are immutable and reusable.
     */
    private static final Pattern FONT_TAG = Pattern.compile("<font[^>]+?>(.*?)</font>");

    // http://www.runoob.com/java/java-regular-expressions.html
    /**
     * Collects the text inside every {@code <font>} element of a sample product title
     * and prints the resulting list.
     */
    @Test
    public void testRegex() {
        String input = "【丝路亚心】250gX4<font color=\"red\">核桃仁</font> 原味生<font color=\"red\">核桃核桃仁</font> 新疆特产, <font color=\"red\">123445</font>bcdef";
        Matcher matcher = FONT_TAG.matcher(input);
        List<String> output = new ArrayList<>();
        int count = 0;
        while (matcher.find()) {
            System.out.println("--" + count);
            // Always read capture group 1. The group index is fixed by the pattern and
            // must not follow the match counter: the pattern has a single group, so
            // group(2) and above would throw IndexOutOfBoundsException.
            output.add(matcher.group(1));
            count++;
        }

        System.out.println(output);
    }

    /**
     * Parses a sample HTML fragment with Jsoup and prints the parsed document, its text,
     * and the inner HTML of the first {@code span} and first {@code font} elements.
     */
    @Test
    public void testHtml() {
        String html = "<span style=\"color:red;border-radius:10px;background-color:blue;\">抢购</span> 我是标题我是标题我是标题<font color=\"red\">高亮</font>我是标题我是标题我是标题我是标题<font color=\"red\">高亮</font>我是标题";

        Document doc = Jsoup.parse(html);
        System.out.println(doc); // print the HTML document, tags included
        System.out.println("\n---->" + doc.text()); // print the text content only
        System.out.println("\n---->" + doc.getElementsByTag("span").get(0).html());
        System.out.println("\n---->" + doc.getElementsByTag("font").get(0).html());
    }

    /**
     * Prints the width, color and background-color read from a sample inline style.
     */
    @Test
    public void testStyle() {
        String style = "position: absolute; width: 500px; height: 552px; color: red; background-color: blue;";
        String width = "width";
        String color = "color";
        String backgroundColor = "background-color";
        System.out.println(getFloat(style, width));
        System.out.println(getString(style, color));
        System.out.println(getString(style, backgroundColor));
    }

    // https://www.cnblogs.com/qlqwjy/p/7531579.html
    /**
     * Reads a numeric property from an inline style string.
     *
     * <p>If the property value contains {@code px}, the text before the first {@code px}
     * is parsed; otherwise the whole value is parsed, so unit-less numbers are accepted.
     *
     * @param value    the style string, e.g. {@code "width: 500px; color: red;"}; may be null
     * @param property the exact property name to look up, e.g. {@code "width"}; may be null
     * @return the parsed number, or 0 when the input is null, the property is absent, or
     *         its value is not a number
     */
    public static double getFloat(String value, String property) {
        String raw = findDeclarationValue(value, property);
        if (raw == null) {
            return 0;
        }
        int unitStart = raw.indexOf("px");
        String number = unitStart >= 0 ? raw.substring(0, unitStart) : raw;
        try {
            return Double.parseDouble(number.trim());
        } catch (NumberFormatException ignored) {
            // Best-effort lookup: a non-numeric value (e.g. "auto", "50%") yields the default.
            return 0;
        }
    }

    /**
     * Reads a property from an inline style string as trimmed text.
     *
     * @param value    the style string, e.g. {@code "width: 500px; color: red;"}; may be null
     * @param property the exact property name to look up, e.g. {@code "color"}; may be null
     * @return the trimmed property value, or an empty string when the input is null or the
     *         property is absent
     */
    public static String getString(String value, String property) {
        String raw = findDeclarationValue(value, property);
        return raw == null ? "" : raw;
    }

    /**
     * Finds the first {@code name: value} declaration whose name equals {@code property}.
     *
     * <p>The style is split on {@code ';'} and each declaration on its first {@code ':'}.
     * Names are compared exactly (case-sensitive, after trimming), so {@code "color"} does
     * not match {@code "background-color"}. Declarations without a colon are skipped.
     * Quoted strings and {@code url(...)} values containing {@code ';'} are not understood.
     *
     * @return the trimmed value, or null when either argument is null or nothing matches
     */
    private static String findDeclarationValue(String style, String property) {
        if (style == null || property == null) {
            return null;
        }
        String wanted = property.trim();
        for (String declaration : style.split(";")) {
            int colon = declaration.indexOf(':');
            if (colon < 0) {
                continue;
            }
            if (declaration.substring(0, colon).trim().equals(wanted)) {
                return declaration.substring(colon + 1).trim();
            }
        }
        return null;
    }
}