从国家统计局爬下来的地区信息

太过爱你忘了你带给我的痛 2022-07-16 07:21 346阅读 0赞

发现网上流传的地区编码有很多版本,其中不少还包含崇文区、宣武区之类已经撤销的区。于是想了想,还是自己做一份。不敢保证没问题,但目前还没遇到问题。

首先,从网上找到一个大神写的jsoup的例子,修改成自己想要的格式,在代码无价的年代,原谅我的抄袭,研究是份任重而道远的任务。

1.jsoup代码:

ContractedBlock.gif ExpandedBlockStart.gif

  1. 1 package com.test;
  2. 2
  3. 3 import java.io.BufferedWriter;
  4. 4 import java.io.File;
  5. 5 import java.io.FileWriter;
  6. 6 import java.io.IOException;
  7. 7 import java.util.HashMap;
  8. 8 import java.util.Map;
  9. 9 import java.util.Random;
  10. 10
  11. 11 import org.apache.log4j.Logger;
  12. 12 import org.jsoup.Jsoup;
  13. 13 import org.jsoup.nodes.Document;
  14. 14 import org.jsoup.nodes.Element;
  15. 15 import org.jsoup.select.Elements;
  16. 16 import org.junit.Test;
  17. 17
  18. 18 /**
  19. 19 * 全国省市县镇村数据爬取
  20. 20 *
  21. 21 * @author liushaofeng
  22. 22 * @date 2015-10-11 上午12:19:39
  23. 23 * @version 1.0.0
  24. 24 */
  25. 25 public class JsoupProviceTest {
  26. 26 private static Map<Integer, String> cssMap = new HashMap<Integer, String>();
  27. 27
  28. 28 static {
  29. 29 cssMap.put(1, "provincetr");// 省
  30. 30 cssMap.put(2, "citytr");// 市
  31. 31 cssMap.put(3, "countytr");// 县
  32. 32 cssMap.put(4, "towntr");// 镇
  33. 33 cssMap.put(5, "villagetr");// 村
  34. 34 }
  35. 35
  36. 36 public static void main(String[] args) throws IOException {
  37. 37 int level = 1;
  38. 38 // TestConDataBase.initDataBase();
  39. 39
  40. 40 // 获取全国各个省级信息
  41. 41 Document connect = connect("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2013/");
  42. 42 Elements rowProvince = connect.select("tr." + cssMap.get(level));
  43. 43 for (Element provinceElement : rowProvince)// 遍历每一行的省份城市
  44. 44 {
  45. 45 Elements select = provinceElement.select("a");
  46. 46 for (Element province : select)// 每一个省份(四川省)
  47. 47 {
  48. 48 String href = province.attr("href");
  49. 49 String procode = href.substring(href.length()-7, href.length()-5)+"0000000000";
  50. 50 System.out.println(level+1+","+procode+","+province.text());
  51. 51 // SysZone zone = new SysZone();
  52. 52 // try {
  53. 53 // zone.set("zoneLevel", level+1)
  54. 54 // .set("zoneCode", procode.trim())
  55. 55 // .set("parentCode", "000000000000")
  56. 56 // .set("zoneName", province.text())
  57. 57 // .save();
  58. 58 // } catch (Exception e1) {
  59. 59 // // TODO Auto-generated catch block
  60. 60 // e1.printStackTrace();
  61. 61 //
  62. 62 // }
  63. 63
  64. 64 parseNextLevel(province, level + 1,procode);
  65. 65 }
  66. 66 }
  67. 67 // for (int i = 3; i < rowProvince.size(); i++) {
  68. 68 // Element provinceElement = rowProvince.get(i);
  69. 69 // Elements select = provinceElement.select("a");
  70. 70 // for (int j = 2; j < select.size(); j++) {
  71. 71 // Element province = select.get(j);
  72. 72 // System.out.println(province.text());
  73. 73 // parseNextLevel(province, level + 1);
  74. 74 // }
  75. 75 // }
  76. 76 }
  77. 77
  78. 78 @Test
  79. 79 public void testa(){
  80. 80 // 获取全国各个省级信息
  81. 81 Document connect = connect("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2013/");
  82. 82 Elements rowProvince = connect.select("tr." + cssMap.get(1));
  83. 83 for (Element provinceElement : rowProvince)// 遍历每一行的省份城市
  84. 84 {
  85. 85 Elements select = provinceElement.select("a");
  86. 86 for (Element province : select)// 每一个省份(四川省)
  87. 87 {
  88. 88 printProvince(province);
  89. 89 }
  90. 90 }
  91. 91 }
  92. 92
  93. 93 private static void parseNextLevel(Element parentElement, int level, String parent)
  94. 94 throws IOException {
  95. 95 try {
  96. 96 Thread.sleep(500);
  97. 97 } catch (InterruptedException e) {
  98. 98 e.printStackTrace();
  99. 99 }
  100. 100
  101. 101 Document doc = connect(parentElement.attr("abs:href"));
  102. 102 if(doc==null){
  103. 103 doc = connect(parentElement.attr("abs:href"));
  104. 104 }
  105. 105 Elements newsHeadlines = doc.select("tr." + cssMap.get(level));//
  106. 106 // 获取表格的一行数据
  107. 107 for (Element element : newsHeadlines) {
  108. 108 String parents = printInfo(element, level + 1,parent);
  109. 109 Elements select = element.select("a");
  110. 110 if (select.size() != 0) {
  111. 111 parseNextLevel(select.last(), level + 1,parents);
  112. 112 }
  113. 113 }
  114. 114 }
  115. 115
  116. 116 private static void printProvince(Element province){
  117. 117 BufferedWriter bufferedWriter = null;
  118. 118 try {
  119. 119 bufferedWriter = new BufferedWriter(new FileWriter(new File(
  120. 120 "F:\\provinces.txt"), true));
  121. 121 String pro = province.text();
  122. 122 String href = province.attr("href");
  123. 123 String procode = href.substring(href.length()-7, href.length()-5)+"0000000000";
  124. 124 System.out.println(2+","+procode+",000000000000,"+pro);
  125. 125 bufferedWriter.write(2+","+procode+",000000000000,"+pro);
  126. 126 bufferedWriter.newLine();
  127. 127 bufferedWriter.flush();
  128. 128 } catch (IOException e) {
  129. 129 e.printStackTrace();
  130. 130 } finally {
  131. 131 if (bufferedWriter != null) {
  132. 132 try {
  133. 133 bufferedWriter.close();
  134. 134 } catch (IOException e) {
  135. 135 e.printStackTrace();
  136. 136 }
  137. 137 bufferedWriter = null;
  138. 138 }
  139. 139 }
  140. 140 }
  141. 141
  142. 142 private static String printInfo(Element element, int level,String parent) {
  143. 143 BufferedWriter bufferedWriter = null;
  144. 144 String code = "";
  145. 145 code = element.select("td").first().text();
  146. 146 String name =element.select("td").last().text();
  147. 147 String str = level + "," + code + ","+parent+","
  148. 148 + name;
  149. 149
  150. 150 System.out.println(str);
  151. 151 // SysZone zone = new SysZone();
  152. 152 // try {
  153. 153 // zone.set("zoneLevel", level)
  154. 154 // .set("zoneCode", code.trim())
  155. 155 // .set("parentCode", parent.trim())
  156. 156 // .set("zoneName", name.trim())
  157. 157 // .save();
  158. 158 // } catch (Exception e1) {
  159. 159 // // TODO Auto-generated catch block
  160. 160 // e1.printStackTrace();
  161. 161 //
  162. 162 // }
  163. 163 try {
  164. 164 bufferedWriter = new BufferedWriter(new FileWriter(new File(
  165. 165 "F:\\AllCity.txt"), true));
  166. 166
  167. 167 bufferedWriter.write(str);
  168. 168 bufferedWriter.newLine();
  169. 169 bufferedWriter.flush();
  170. 170 } catch (IOException e) {
  171. 171 e.printStackTrace();
  172. 172 } finally {
  173. 173 if (bufferedWriter != null) {
  174. 174 try {
  175. 175 bufferedWriter.close();
  176. 176 } catch (IOException e) {
  177. 177 e.printStackTrace();
  178. 178 }
  179. 179 bufferedWriter = null;
  180. 180 }
  181. 181 }
  182. 182
  183. 183 return code;
  184. 184 }
  185. 185
  186. 186 private static Document connect(String url) {
  187. 187 if (url == null || url.isEmpty()) {
  188. 188 throw new IllegalArgumentException("The input url('" + url
  189. 189 + "') is invalid!");
  190. 190 }
  191. 191 String [] b = {
  192. 192 "Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3",
  193. 193 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
  194. 194 "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
  195. 195 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
  196. 196 "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
  197. 197 "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
  198. 198 "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
  199. 199 "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
  200. 200 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
  201. 201 "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
  202. 202 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
  203. 203 "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
  204. 204 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
  205. 205 "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
  206. 206 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
  207. 207 "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
  208. 208 "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
  209. 209 "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
  210. 210 "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
  211. 211 };
  212. 212 try {
  213. 213
  214. 214 Random rand = new Random();
  215. 215 return Jsoup.connect(url)
  216. 216 .header("User-Agent",b[rand.nextInt(19)])
  217. 217 .timeout(90 * 1000).get();
  218. 218 } catch (IOException e) {
  219. 219 e.printStackTrace();
  220. 220 }
  221. 221 return null;
  222. 222 }
  223. 223 }

2.可以选择从文本读取后写入数据库,也可直接写入。

686418-20151106183632367-955581355.png

3.最终,我生成了两份,一个是省市县三级的,一个是所有的。

省市县中去掉了市辖区等无关代码。

4.爬虫源码:包括数据库保存。下载源码

5.省市县:下载txt, 下载sql

6.所有地区(港澳台除外):下载txt, 下载sql

发表评论

表情:
评论列表 (有 0 条评论,346人围观)

还没有评论,来说两句吧...

相关阅读