11package com .code .repository .study .http ;
22
3+ import com .alibaba .fastjson .JSON ;
4+ import com .code .repository .util .JsonStringUtil ;
35import org .apache .commons .lang3 .StringUtils ;
46import org .apache .http .HttpEntity ;
57import org .apache .http .HttpResponse ;
68import org .apache .http .client .HttpClient ;
79import org .apache .http .client .methods .HttpGet ;
810import org .apache .http .impl .client .HttpClients ;
911import org .apache .http .util .EntityUtils ;
12+ import org .assertj .core .util .Lists ;
1013import org .jsoup .Jsoup ;
1114import org .jsoup .nodes .Document ;
1215import org .jsoup .nodes .Element ;
1922import java .net .URI ;
2023import java .util .ArrayList ;
2124import java .util .HashMap ;
25+ import java .util .List ;
2226import java .util .Map ;
23- import java .util .concurrent .Callable ;
24- import java .util .concurrent .ExecutorService ;
25- import java .util .concurrent .Executors ;
26- import java .util .concurrent .Future ;
27+ import java .util .concurrent .*;
2728
2829/**
2930 * @Author zhaoyuan.lizy on 2019/9/5
3031 **/
3132
32- public class HscodeGatherer implements Runnable {
33+ public class HscodeGatherer implements Callable < List < Map < String , String >>> {
3334
34- private int start = 0 ;
35+ private String hscode4 = null ;
3536
36- private int end =0 ;
37-
38- HscodeGatherer (int start ,int end ){
39- this .start = start ;
40- this .end = end ;
37+ HscodeGatherer (String hscode4 ){
38+ this .hscode4 = hscode4 ;
4139 }
4240
41+ private static int corePoolSize = 20 ; // 主流线程个数
42+ private static int maximumPoolSize = 50 ; // 线程最大个数
43+ private static long keepAliveTime = 1000L ; // 大于主流线程个数的线程空闲的过期时间 wait for new tasks before terminating
44+ private static TimeUnit unit = TimeUnit .MILLISECONDS ; // 时间单元
45+ private static BlockingQueue <Runnable > workQueue = new LinkedBlockingQueue <Runnable >(); // 工作队列,有三种类SynchronousQueue、LinkedBlockingQueue(在所有 corePoolSize 线程都忙时新任务在队列中等待,maximumPoolSiz失效)、ArrayBlockingQueue,分别对应同步队列、无界队列、有界队列。
4346
44- public static void main ( String [] args ){
47+ private static final ThreadPoolExecutor executorPool = new ThreadPoolExecutor ( corePoolSize , maximumPoolSize , keepAliveTime , unit , workQueue );
4548
46- // 多线程
47- // ExecutorService executor = Executors.newCachedThreadPool();
48- // try{
49- // executor.submit(new HscodeGatherer(62,63));
50- // } catch(Exception e){
51- // }
5249
53- // 单线程
54- HscodeGatherer .fetchScope (62 ,62 );
50+ public static void main (String [] args ){
51+
52+ // // 多线程
53+ ArrayList <Future <List <Map <String , String >>>> results = new ArrayList <Future <List <Map <String , String >>>>();
54+ try {
55+ int start1 =1 ;
56+ for (;start1 <=97 ;start1 ++) {
57+ int start2 =1 ;
58+ for (; start2 <= 99 ; start2 ++) {
59+ String hscode4 = HscodeGatherer .numToStr (start1 )+HscodeGatherer .numToStr (start2 );
60+ System .out .println (hscode4 );
61+ results .add (executorPool .submit (new HscodeGatherer (hscode4 )));
62+ // System.out.println("====thread num:"+executorPool.getActiveCount());
63+ }
64+ }
65+ int count =1 ;
66+ for (Future <List <Map <String , String >>> future : results ){
67+ List <Map <String , String >> result = future .get ();
68+ if (result !=null && result .size ()>0 ){
69+ HscodeGatherer .writeToFile (result );// 写文件
70+ }
71+ System .out .println ("=====finish num:" +count ++);
72+ }
73+ System .out .println ("=====finish all:" +count ++);
74+ return ;
75+ } catch (Exception e ){
76+ }
5577
5678
5779 // 单个code
58- // String url = "https://www.customs.gov.vn/SitePages/Tariff-Search.aspx?portlet=DetailsImportTax&language=en-US&code=" ;
59- // String hscode = "62043300" ;
60- // Map< String,String> result = HscodeGatherer.fetchInfo(url+hscode); // 爬取数据
61- // result.put("hscode",hscode); // 填充hscode
62- // System.out.println(JSON.toJSONString(result));
80+ // List< String> hscode8List = HscodeGatherer.fetchHscode8("0101") ;
81+ // List<Map< String, String>> result = Lists.newArrayList() ;
82+ // for( String hscode8 : hscode8List){
83+ // result.add(HscodeGatherer.parseHscode8Detail(hscode8));
84+ // }
6385// HscodeGatherer.writeToFile(result);// 写文件
6486 }
6587
66- @ Override
67- public void run () {
68- this .fetchScope (start ,end );
69- }
7088
71- private static void fetchScope (int start , int end ){ // 开头两位 01-97
72- if (start <=0 ){
73- start =1 ;
74- }
75- if (end >97 ){
76- end =97 ;
77- }
78- int start1 = start ;
79- int start2 = 1 ;
80- int start3 = 1 ;
81- int start4 = 0 ;
82- for (;start1 <=end ;start1 ++){
83- for (;start2 <=97 ;start2 ++){
84- for (;start3 <=99 ;start3 ++){
85- for (;start4 <=99 ;start4 ++){ // 生成8位
86- String hscode = HscodeGatherer .numToStr (start1 )+HscodeGatherer .numToStr (start2 )+HscodeGatherer .numToStr (start3 )+HscodeGatherer .numToStr (start4 );
87- // String hscode = "62043300";
88- System .out .println ("=====hscode:" +hscode );
89- String url = "https://www.customs.gov.vn/SitePages/Tariff-Search.aspx?portlet=DetailsImportTax&language=en-US&code=" ;
90- Map <String ,String > result = HscodeGatherer .fetchInfo (url +hscode ); // 爬取数据
91- if (result == null ){
92- System .out .println ("=====no result:" );
93- continue ;
94- }
95- System .out .println ("==================================ok:" );
96- result .put ("hscode" ,hscode ); // 填充hscode
97- // System.out.println(JSON.toJSONString(result));
98- HscodeGatherer .writeToFile (result ,start ,end );// 写文件
99- }
100- }
101- }
89+ private static String numToStr (int num ){
90+ if (num <9 ){
91+ return "0" +num ;
92+ }else {
93+ return String .valueOf (num );
10294 }
10395 }
10496
105- // 写文件
106- private static void writeToFile (Map <String ,String > result ,int start , int end ){
107- String fileName = "D:\\ hscodeTH" +start +"_" +end +".txt" ;
108- BufferedWriter out = null ;
109- try {
110- OutputStreamWriter ow = new OutputStreamWriter (new FileOutputStream (new File (fileName ),true ),"UTF-8" );
111- out = new BufferedWriter (ow );
112- out .newLine ();
113- out .write (result .get ("hscode" )+";" +result .get ("Favour" )+";" +result .get ("ASEAN - China (ACFTA)" ));
114- out .flush ();
115- out .close ();
116- } catch (Exception e ) {
117- }finally {
97+ @ Override
98+ public List <Map <String , String >> call () throws Exception {
99+ System .out .println ("=== start:" +this .hscode4 );
100+ // 获取8位hscode
101+ List <String > hscode8List = HscodeGatherer .fetchHscode8 (this .hscode4 );
102+ // 获取hsocde税率
103+ List <Map <String , String >> result = Lists .newArrayList ();
104+ for (String hscode8 : hscode8List ){
105+ result .add (HscodeGatherer .parseHscode8Detail (hscode8 ));
118106 }
107+ System .out .println ("=== finish:" +this .hscode4 +",hscode:" +JSON .toJSONString (hscode8List ));
108+ // 返回结果
109+ return result ;
119110 }
120111
121- private static String numToStr (int num ){
122- if (num <9 ){
123- return "0" +num ;
124- }else {
125- return String .valueOf (num );
112+
113+ /**
114+ * 根据4位hscode 获取8位hscode
115+ */
116+ private static List <String > fetchHscode8 (String hscode4 ){
117+ String url = "https://www.customs.gov.vn/SitePages/Tariff-Search.aspx?portlet=Structure&language=en-US&tariff=" +hscode4 ;
118+ String html = HscodeGatherer .doGet (url ); // 获取内容
119+ if (StringUtils .isBlank (html )){
120+ return null ;
126121 }
122+ List <String > hscodeList = Lists .newArrayList ();
123+ Document doc = Jsoup .parse (html );// 结构化解析
124+ Elements tables = doc .getElementsByClass ("tariffList" );
125+ for (Element eTable : tables ){
126+ Elements trs = eTable .select ("tr" );
127+ if (trs .size ()<5 ){
128+ continue ;
129+ }
130+ for (Element tr : trs ){
131+ if (tr .toString ().contains ("DetailsImportTax" )){// 8位hscode
132+ String hscode8 = tr .select ("a" ).get (0 ).text ();
133+ hscodeList .add (hscode8 );
134+ }
135+ }
136+ }
137+ return hscodeList ;
127138 }
128139
129- private static Map <String ,String > fetchInfo (String url ){
130140
141+ /**
142+ * 获取8位hscode税率,forme
143+ */
144+ private static Map <String ,String > parseHscode8Detail (String hscode8 ){
145+ String url = "https://www.customs.gov.vn/SitePages/Tariff-Search.aspx?portlet=DetailsImportTax&language=en-US&code=" +hscode8 ;
131146 String html = HscodeGatherer .doGet (url );
132147 if (StringUtils .isBlank (html )){
133148 return null ;
@@ -137,10 +152,17 @@ private static Map<String,String> fetchInfo(String url){
137152 return null ;
138153 }
139154 Document doc = Jsoup .parse (html );// 结构化解析
140- Elements tables = doc .select ("table" );
155+ Elements tables = doc .getElementsByClass ("tariffList" );
156+ Map <String ,String > result = new HashMap <>();
141157 for (Element eTable : tables ){
158+ if (eTable .text ().contains ("Description" )) { // 描述
159+ Elements trs = eTable .select ("tr" );
160+ if (trs .size ()>=6 ){
161+ Elements tds = trs .get (5 ).select ("td" );
162+ result .put ("desc" ,tds .get (1 ).text ().replaceAll ("-" ,"#" ));
163+ }
164+ }
142165 if (eTable .text ().contains ("Tax rate" )) {// 锁定税率列表,提取各种税率
143- Map <String ,String > result = new HashMap <>();
144166 Elements trs = eTable .select ("tr" );
145167 for (Element tr :trs ){
146168 Elements tds = tr .select ("td" );
@@ -151,14 +173,56 @@ private static Map<String,String> fetchInfo(String url){
151173 result .put (tds .get (0 ).text (),tds .get (1 ).text ());
152174 }
153175 }
154- return result ;
176+ result .put ("hscode" ,hscode8 ); // 填充hscode
177+ // System.out.println("=======hscode8 detail:"+JSON.toJSONString(result));
178+ }
179+ }
180+ return result ;
181+ }
182+
183+ // 写文件
184+ private static void writeToFile (Map <String ,String > result ){
185+ String fileName = "D:\\ hscodeTH.txt" ;
186+ BufferedWriter out = null ;
187+ try {
188+ OutputStreamWriter ow = new OutputStreamWriter (new FileOutputStream (new File (fileName ),true ));
189+ out = new BufferedWriter (ow );
190+ out .newLine ();
191+ if (StringUtils .isBlank (result .get ("ASEAN - China (ACFTA)" ))){
192+ out .write (result .get ("hscode" )+";" +result .get ("Favour" )+";Na" +";" +result .get ("desc" ));
155193 }else {
156- continue ;
194+ out . write ( result . get ( "hscode" )+ ";" + result . get ( "Favour" )+ ";" + result . get ( "ASEAN - China (ACFTA)" )+ ";" + result . get ( "desc" )) ;
157195 }
196+ out .flush ();
197+ out .close ();
198+ } catch (Exception e ) {
199+ }finally {
158200 }
159- return null ;
160201 }
161202
203+ // 写文件
204+ private static void writeToFile (List <Map <String ,String >> resultList ){
205+ String fileName = "D:\\ hscodeTH.txt" ;
206+ BufferedWriter out = null ;
207+ try {
208+ OutputStreamWriter ow = new OutputStreamWriter (new FileOutputStream (new File (fileName ),true ),"GBK" );
209+ out = new BufferedWriter (ow );
210+ for (Map <String ,String > result : resultList ){
211+ out .newLine ();
212+ if (StringUtils .isBlank (result .get ("ASEAN - China (ACFTA)" ))){
213+ out .write (result .get ("hscode" )+";" +result .get ("Favour" )+";Na" +";" +result .get ("desc" ));
214+ }else {
215+ out .write (result .get ("hscode" )+";" +result .get ("Favour" )+";" +result .get ("ASEAN - China (ACFTA)" )+";" +result .get ("desc" ));
216+ }
217+ }
218+ out .flush ();
219+ out .close ();
220+ } catch (Exception e ) {
221+ }finally {
222+ }
223+ }
224+
225+ // 发起get请求
162226 private static String doGet (String url ){
163227 HttpClient client = HttpClients .createDefault ();
164228 HttpGet request = new HttpGet ();
@@ -180,5 +244,4 @@ private static String doGet(String url){
180244 return null ;
181245 }
182246
183-
184247}
0 commit comments