Skip to content

Commit 5b9b23e

Browse files
committed
Merge branch 'master' of https://github.com/wade6/codeRepository
2 parents 98f0b0e + a80240a commit 5b9b23e

1 file changed

Lines changed: 148 additions & 85 deletions

File tree

Lines changed: 148 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
11
package com.code.repository.study.http;
22

3+
import com.alibaba.fastjson.JSON;
4+
import com.code.repository.util.JsonStringUtil;
35
import org.apache.commons.lang3.StringUtils;
46
import org.apache.http.HttpEntity;
57
import org.apache.http.HttpResponse;
68
import org.apache.http.client.HttpClient;
79
import org.apache.http.client.methods.HttpGet;
810
import org.apache.http.impl.client.HttpClients;
911
import org.apache.http.util.EntityUtils;
12+
import org.assertj.core.util.Lists;
1013
import org.jsoup.Jsoup;
1114
import org.jsoup.nodes.Document;
1215
import org.jsoup.nodes.Element;
@@ -19,115 +22,127 @@
1922
import java.net.URI;
2023
import java.util.ArrayList;
2124
import java.util.HashMap;
25+
import java.util.List;
2226
import java.util.Map;
23-
import java.util.concurrent.Callable;
24-
import java.util.concurrent.ExecutorService;
25-
import java.util.concurrent.Executors;
26-
import java.util.concurrent.Future;
27+
import java.util.concurrent.*;
2728

2829
/**
2930
* @Author zhaoyuan.lizy on 2019/9/5
3031
**/
3132

32-
public class HscodeGatherer implements Runnable {
33+
public class HscodeGatherer implements Callable<List<Map<String,String>>> {
3334

34-
private int start=0;
35+
private String hscode4 = null;
3536

36-
private int end=0;
37-
38-
HscodeGatherer(int start,int end){
39-
this.start = start;
40-
this.end = end;
37+
HscodeGatherer(String hscode4){
38+
this.hscode4 = hscode4;
4139
}
4240

41+
private static int corePoolSize = 20; // 主流线程个数
42+
private static int maximumPoolSize = 50; // 线程最大个数
43+
private static long keepAliveTime = 1000L; // 大于主流线程个数的线程空闲的过期时间 wait for new tasks before terminating
44+
private static TimeUnit unit = TimeUnit.MILLISECONDS; // 时间单元
45+
private static BlockingQueue<Runnable> workQueue = new LinkedBlockingQueue<Runnable>(); // 工作队列,有三种类SynchronousQueue、LinkedBlockingQueue(在所有 corePoolSize 线程都忙时新任务在队列中等待,maximumPoolSiz失效)、ArrayBlockingQueue,分别对应同步队列、无界队列、有界队列。
4346

44-
public static void main(String[] args){
47+
private static final ThreadPoolExecutor executorPool = new ThreadPoolExecutor(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue);
4548

46-
// 多线程
47-
// ExecutorService executor = Executors.newCachedThreadPool();
48-
// try{
49-
// executor.submit(new HscodeGatherer(62,63));
50-
// } catch(Exception e){
51-
// }
5249

53-
// 单线程
54-
HscodeGatherer.fetchScope(62,62);
50+
public static void main(String[] args){
51+
52+
// // 多线程
53+
ArrayList<Future<List<Map<String, String>>>> results = new ArrayList<Future<List<Map<String, String>>>>();
54+
try{
55+
int start1=1;
56+
for(;start1<=97;start1++) {
57+
int start2=1;
58+
for (; start2 <= 99; start2++) {
59+
String hscode4 = HscodeGatherer.numToStr(start1)+HscodeGatherer.numToStr(start2);
60+
System.out.println(hscode4);
61+
results.add(executorPool.submit(new HscodeGatherer(hscode4)));
62+
// System.out.println("====thread num:"+executorPool.getActiveCount());
63+
}
64+
}
65+
int count =1;
66+
for(Future<List<Map<String, String>>> future : results){
67+
List<Map<String, String>> result = future.get();
68+
if(result!=null && result.size()>0){
69+
HscodeGatherer.writeToFile(result);// 写文件
70+
}
71+
System.out.println("=====finish num:"+count++);
72+
}
73+
System.out.println("=====finish all:"+count++);
74+
return;
75+
} catch(Exception e){
76+
}
5577

5678

5779
// 单个code
58-
// String url = "https://www.customs.gov.vn/SitePages/Tariff-Search.aspx?portlet=DetailsImportTax&language=en-US&code=";
59-
// String hscode = "62043300";
60-
// Map<String,String> result = HscodeGatherer.fetchInfo(url+hscode); // 爬取数据
61-
// result.put("hscode",hscode); // 填充hscode
62-
// System.out.println(JSON.toJSONString(result));
80+
// List<String> hscode8List = HscodeGatherer.fetchHscode8("0101");
81+
// List<Map<String, String>> result = Lists.newArrayList();
82+
// for(String hscode8 : hscode8List){
83+
// result.add(HscodeGatherer.parseHscode8Detail(hscode8));
84+
// }
6385
// HscodeGatherer.writeToFile(result);// 写文件
6486
}
6587

66-
@Override
67-
public void run() {
68-
this.fetchScope(start,end);
69-
}
7088

71-
private static void fetchScope(int start, int end){ // 开头两位 01-97
72-
if(start<=0){
73-
start=1;
74-
}
75-
if(end>97){
76-
end=97;
77-
}
78-
int start1 = start;
79-
int start2 = 1;
80-
int start3 = 1;
81-
int start4 = 0;
82-
for(;start1<=end;start1++){
83-
for(;start2 <=97;start2++){
84-
for(;start3 <=99;start3++){
85-
for(;start4 <=99;start4++){ // 生成8位
86-
String hscode = HscodeGatherer.numToStr(start1)+HscodeGatherer.numToStr(start2)+HscodeGatherer.numToStr(start3)+HscodeGatherer.numToStr(start4);
87-
// String hscode = "62043300";
88-
System.out.println("=====hscode:"+hscode);
89-
String url = "https://www.customs.gov.vn/SitePages/Tariff-Search.aspx?portlet=DetailsImportTax&language=en-US&code=";
90-
Map<String,String> result = HscodeGatherer.fetchInfo(url+hscode); // 爬取数据
91-
if(result == null){
92-
System.out.println("=====no result:");
93-
continue;
94-
}
95-
System.out.println("==================================ok:");
96-
result.put("hscode",hscode); // 填充hscode
97-
// System.out.println(JSON.toJSONString(result));
98-
HscodeGatherer.writeToFile(result,start,end);// 写文件
99-
}
100-
}
101-
}
89+
private static String numToStr(int num){
90+
if(num<9){
91+
return "0"+num;
92+
}else{
93+
return String.valueOf(num);
10294
}
10395
}
10496

105-
// 写文件
106-
private static void writeToFile(Map<String,String> result,int start, int end){
107-
String fileName = "D:\\hscodeTH"+start+"_"+end+".txt";
108-
BufferedWriter out = null;
109-
try {
110-
OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(new File(fileName),true),"UTF-8");
111-
out = new BufferedWriter(ow);
112-
out.newLine();
113-
out.write(result.get("hscode")+";"+result.get("Favour")+";"+result.get("ASEAN - China  (ACFTA)"));
114-
out.flush();
115-
out.close();
116-
} catch (Exception e) {
117-
}finally{
97+
@Override
98+
public List<Map<String, String>> call() throws Exception {
99+
System.out.println("=== start:"+this.hscode4);
100+
// 获取8位hscode
101+
List<String> hscode8List = HscodeGatherer.fetchHscode8(this.hscode4);
102+
// 获取hsocde税率
103+
List<Map<String, String>> result = Lists.newArrayList();
104+
for(String hscode8 : hscode8List){
105+
result.add(HscodeGatherer.parseHscode8Detail(hscode8));
118106
}
107+
System.out.println("=== finish:"+this.hscode4+",hscode:"+JSON.toJSONString(hscode8List));
108+
// 返回结果
109+
return result;
119110
}
120111

121-
private static String numToStr(int num){
122-
if(num<9){
123-
return "0"+num;
124-
}else{
125-
return String.valueOf(num);
112+
113+
/**
114+
* 根据4位hscode 获取8位hscode
115+
*/
116+
private static List<String> fetchHscode8(String hscode4){
117+
String url = "https://www.customs.gov.vn/SitePages/Tariff-Search.aspx?portlet=Structure&language=en-US&tariff="+hscode4;
118+
String html = HscodeGatherer.doGet(url); // 获取内容
119+
if(StringUtils.isBlank(html)){
120+
return null ;
126121
}
122+
List<String> hscodeList = Lists.newArrayList();
123+
Document doc = Jsoup.parse(html);// 结构化解析
124+
Elements tables = doc.getElementsByClass("tariffList");
125+
for(Element eTable : tables){
126+
Elements trs = eTable.select("tr");
127+
if(trs.size()<5){
128+
continue;
129+
}
130+
for(Element tr : trs){
131+
if(tr.toString().contains("DetailsImportTax")){// 8位hscode
132+
String hscode8 = tr.select("a").get(0).text();
133+
hscodeList.add(hscode8);
134+
}
135+
}
136+
}
137+
return hscodeList;
127138
}
128139

129-
private static Map<String,String> fetchInfo(String url){
130140

141+
/**
142+
* 获取8位hscode税率,forme
143+
*/
144+
private static Map<String,String> parseHscode8Detail(String hscode8){
145+
String url = "https://www.customs.gov.vn/SitePages/Tariff-Search.aspx?portlet=DetailsImportTax&language=en-US&code="+hscode8;
131146
String html = HscodeGatherer.doGet(url);
132147
if(StringUtils.isBlank(html)){
133148
return null;
@@ -137,10 +152,17 @@ private static Map<String,String> fetchInfo(String url){
137152
return null;
138153
}
139154
Document doc = Jsoup.parse(html);// 结构化解析
140-
Elements tables = doc.select("table");
155+
Elements tables = doc.getElementsByClass("tariffList");
156+
Map<String,String> result = new HashMap<>();
141157
for(Element eTable : tables){
158+
if (eTable.text().contains("Description")) { // 描述
159+
Elements trs = eTable.select("tr");
160+
if(trs.size()>=6){
161+
Elements tds = trs.get(5).select("td");
162+
result.put("desc",tds.get(1).text().replaceAll("-","#"));
163+
}
164+
}
142165
if (eTable.text().contains("Tax rate")) {// 锁定税率列表,提取各种税率
143-
Map<String,String> result = new HashMap<>();
144166
Elements trs = eTable.select("tr");
145167
for(Element tr:trs){
146168
Elements tds = tr.select("td");
@@ -151,14 +173,56 @@ private static Map<String,String> fetchInfo(String url){
151173
result.put(tds.get(0).text(),tds.get(1).text());
152174
}
153175
}
154-
return result;
176+
result.put("hscode",hscode8); // 填充hscode
177+
// System.out.println("=======hscode8 detail:"+JSON.toJSONString(result));
178+
}
179+
}
180+
return result;
181+
}
182+
183+
// 写文件
184+
private static void writeToFile(Map<String,String> result){
185+
String fileName = "D:\\hscodeTH.txt";
186+
BufferedWriter out = null;
187+
try {
188+
OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(new File(fileName),true));
189+
out = new BufferedWriter(ow);
190+
out.newLine();
191+
if(StringUtils.isBlank(result.get("ASEAN - China  (ACFTA)"))){
192+
out.write(result.get("hscode")+";"+result.get("Favour")+";Na"+";"+result.get("desc"));
155193
}else{
156-
continue;
194+
out.write(result.get("hscode")+";"+result.get("Favour")+";"+result.get("ASEAN - China  (ACFTA)")+";"+result.get("desc"));
157195
}
196+
out.flush();
197+
out.close();
198+
} catch (Exception e) {
199+
}finally{
158200
}
159-
return null;
160201
}
161202

203+
// 写文件
204+
private static void writeToFile(List<Map<String,String>> resultList){
205+
String fileName = "D:\\hscodeTH.txt";
206+
BufferedWriter out = null;
207+
try {
208+
OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(new File(fileName),true),"GBK");
209+
out = new BufferedWriter(ow);
210+
for(Map<String,String> result : resultList){
211+
out.newLine();
212+
if(StringUtils.isBlank(result.get("ASEAN - China  (ACFTA)"))){
213+
out.write(result.get("hscode")+";"+result.get("Favour")+";Na"+";"+result.get("desc"));
214+
}else{
215+
out.write(result.get("hscode")+";"+result.get("Favour")+";"+result.get("ASEAN - China  (ACFTA)")+";"+result.get("desc"));
216+
}
217+
}
218+
out.flush();
219+
out.close();
220+
} catch (Exception e) {
221+
}finally{
222+
}
223+
}
224+
225+
// 发起get请求
162226
private static String doGet(String url){
163227
HttpClient client = HttpClients.createDefault();
164228
HttpGet request = new HttpGet();
@@ -180,5 +244,4 @@ private static String doGet(String url){
180244
return null;
181245
}
182246

183-
184247
}

0 commit comments

Comments
 (0)