-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpython_xml_parsing.py
More file actions
33 lines (27 loc) · 11.7 KB
/
python_xml_parsing.py
File metadata and controls
33 lines (27 loc) · 11.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# -*- coding: utf-8 -*-
__author__ = 'k22li'
from xml.etree import ElementTree
raw_xml = """ <!doctype html><html itemscope="" itemtype="http://schema.org/WebPage"><head><meta itemprop="image" content="/images/google_favicon_128.png"><title>Google</title><script>(function(){
window.google={kEI:"56KkUrzuEenIiAe2rYGwCA",getEI:function(a){for(var b;a&&(!a.getAttribute||!(b=a.getAttribute("eid")));)a=a.parentNode;return b||google.kEI},https:function(){return"https:"==window.location.protocol},kEXPI:"17259,146363,148115,4000116,4005663,4007278,4007661,4007714,4007830,4008067,4008133,4008142,4009033,4009352,4009565,4009641,4010806,4010858,4010899,4011053,4011063,4011228,4011258,4011524,4011863,4011971,4012001,4012096,4012141,4012145,4012190,4012275,4012302,4012316,4012318,4012320,4012342,4012344,4012365,4012414,4012511,4012526,4012534,4012547,4012585,4012629,4012632,4012691,4012836,4012851,4012869,4012916,4013140,4013146,4013312,4013314,4013363,4013374,4013378,4013416,4013474,4013513,4013551,4013567,4013631,4013634,4013667,4013668,4013671,4013672,4013703,4013711,4013723,4013725,4013741,4013842,4013844,4013854",kCSI:{e:"17259,146363,148115,4000116,4005663,4007278,4007661,4007714,4007830,4008067,4008133,4008142,4009033,4009352,4009565,4009641,4010806,4010858,4010899,4011053,4011063,4011228,4011258,4011524,4011863,4011971,4012001,4012096,4012141,4012145,4012190,4012275,4012302,4012316,4012318,4012320,4012342,4012344,4012365,4012414,4012511,4012526,4012534,4012547,4012585,4012629,4012632,4012691,4012836,4012851,4012869,4012916,4013140,4013146,4013312,4013314,4013363,4013374,4013378,4013416,4013474,4013513,4013551,4013567,4013631,4013634,4013667,4013668,4013671,4013672,4013703,4013711,4013723,4013725,4013741,4013842,4013844,4013854",ei:"56KkUrzuEenIiAe2rYGwCA"},authuser:0,ml:function(){},kHL:"zh-CN",time:function(){return(new Date).getTime()},log:function(a,b,c,l,k){var d=new Image,f=google.lc,e=google.li,g="",h="gen_204";k&&(h=
k);d.onerror=d.onload=d.onabort=function(){delete f[e]};f[e]=d;c||-1!=b.search("&ei=")||(g="&ei="+google.getEI(l));c=c||"/"+h+"?atyp=i&ct="+a+"&cad="+b+g+"&zx="+google.time();
a=/^http:/i;a.test(c)&&google.https()?(google.ml(Error("GLMM"),!1,{src:c}),delete f[e]):(d.src=c,google.li=e+1)},lc:[],li:0,Toolbelt:{},y:{},x:function(a,b){google.y[a.id]=[a,b];return!1},load:function(a,b,c){google.x({id:a+
m++},function(){google.load(a,b,c)})}};var m=0;})();
(function(){google.sn="webhp";google.timers={};google.startTick=function(a,b){google.timers[a]={t:{start:google.time()},bfr:!!b}};google.tick=function(a,b,g){google.timers[a]||google.startTick(a);google.timers[a].t[b]=g||google.time()};google.startTick("load",!0);
try{}catch(d){}})();
var _gjwl=location;function _gjuc(){var a=_gjwl.href.indexOf("#");if(0<=a&&(a=_gjwl.href.substring(a),0<a.indexOf("&q=")||0<=a.indexOf("#q="))&&(a=a.substring(1),-1==a.indexOf("#"))){for(var d=0;d<a.length;){var b=d;"&"==a.charAt(b)&&++b;var c=a.indexOf("&",b);-1==c&&(c=a.length);b=a.substring(b,c);if(0==b.indexOf("fp="))a=a.substring(0,d)+a.substring(c,a.length),c=d;else if("cad=h"==b)return 0;d=c}_gjwl.href="/search?"+a+"&cad=h";return 1}return 0}
function _gjh(){!_gjuc()&&window.google&&google.x&&google.x({id:"GJH"},function(){google.nav&&google.nav.gjh&&google.nav.gjh()})};
window._gjh&&_gjh();</script><style>#gbar,#guser{font-size:13px;padding-top:1px !important;}#gbar{height:22px}#guser{padding-bottom:7px !important;text-align:right}.gbh,.gbd{border-top:1px solid #c9d7f1;font-size:1px}.gbh{height:0;position:absolute;top:24px;width:100%}@media all{.gb1{height:22px;margin-right:.5em;vertical-align:top}#gbar{float:left}}a.gb1,a.gb4{text-decoration:underline !important}a.gb1,a.gb4{color:#00c !important}.gbi .gb4{color:#dd8e27 !important}.gbf .gb4{color:#900 !important}</style><style>body,td,a,p,.h{font-family:arial,sans-serif}body{margin:0;overflow-y:scroll}#gog{padding:3px 8px 0}td{line-height:.8em}.gac_m td{line-height:17px}form{margin-bottom:20px}.h{color:#36c}.q{color:#00c}.ts td{padding:0}.ts{border-collapse:collapse}em{color:#c03;font-style:normal;font-weight:normal}a em{text-decoration:underline}.lst{height:25px;width:496px}.gsfi,.lst{font:18px arial,sans-serif}.gsfs{font:17px arial,sans-serif}.ds{display:inline-box;display:inline-block;margin:3px 0 4px;margin-left:4px}input{font-family:inherit}a.gb1,a.gb2,a.gb3,a.gb4{color:#11c !important}body{background:#fff;color:black}a{color:#11c;text-decoration:none}a:hover,a:active{text-decoration:underline}.fl a{color:#36c}a:visited{color:#551a8b}a.gb1,a.gb4{text-decoration:underline}a.gb3:hover{text-decoration:none}#ghead a.gb2:hover{color:#fff !important}.sblc{padding-top:5px}.sblc a{display:block;margin:2px 0;margin-left:13px;font-size:11px}.lsbb{background:#eee;border:solid 1px;border-color:#ccc #999 #999 #ccc;height:30px}.lsbb{display:block}.ftl,#fll a{display:inline-block;margin:0 12px}.lsb{background:url(/images/srpr/nav_logo80.png) 0 -258px repeat-x;border:none;color:#000;cursor:pointer;height:30px;margin:0;outline:0;font:15px arial,sans-serif;vertical-align:top}.lsb:active{background:#ccc}.lst:focus{outline:none}#addlang a{padding:0 3px}</style><script></script> </head><body bgcolor="#fff"><script>(function(){var src='/images/srpr/nav_logo80.png';var 
iesg=false;document.body.onload = function(){window.n && window.n();if (document.images){new Image().src=src;}
if (!iesg){document.f&&document.f.q.focus();document.gbqf&&document.gbqf.q.focus();}
}
})();</script><textarea id="csi" style="display:none"></textarea><div id="mngb"><div id=gbar><nobr><b class=gb1>搜索</b> <a class=gb1 href="http://www.google.com.hk/imghp?hl=zh-CN&tab=wi">图片</a> <a class=gb1 href="http://ditu.google.cn/maps?hl=zh-CN&tab=wl">地图</a> <a class=gb1 href="https://play.google.com/?hl=zh-CN&tab=w8">Play</a> <a class=gb1 href="http://www.youtube.com/?gl=HK&tab=w1">YouTube</a> <a class=gb1 href="http://news.google.com.hk/nwshp?hl=zh-CN&tab=wn">新闻</a> <a class=gb1 href="https://mail.google.com/mail/?tab=wm">Gmail</a> <a class=gb1 href="https://drive.google.com/?tab=wo">云端硬盘</a> <a class=gb1 style="text-decoration:none" href="http://www.google.com.hk/intl/zh-CN/options/"><u>更多</u> »</a></nobr></div><div id=guser width=100%><nobr><span id=gbn class=gbi></span><span id=gbf class=gbf></span><span id=gbe></span><a href="http://www.google.com.hk/history/optout?hl=zh-CN" class=gb4>网络历史记录</a> | <a href="/preferences?hl=zh-CN" class=gb4>设置</a> | <a target=_top id=gb_70 href="https://accounts.google.com/ServiceLogin?hl=zh-CN&continue=http://www.google.com.hk/" class=gb4>登录</a></nobr></div><div class=gbh style=left:0></div><div class=gbh style=right:0></div></div><center><br clear="all" id="lgpd"><div id="lga"><a href="/search?newwindow=1&safe=strict&site=&ie=UTF-8&q=Grace+Hopper&oi=ddle&ct=grace-hoppers-107th-birthday-5447077240766464&hl=zh-TW"><img alt="商用��程式�言 COBOL 之母葛��・霍普107��辰" border="0" height="200" src="/logos/doodles/2013/grace-hoppers-107th-birthday-5447077240766464.2-hp.gif" title="商用��程式�言 COBOL 之母葛��・霍普107��辰" width="517" id="hplogo" onload="window.lol&&lol()"><br></a><br></div><form action="/search" name="f"><table cellpadding="0" cellspacing="0"><tr valign="top"><td width="25%"> </td><td align="center" nowrap=""><input name="ie" value="GB2312" type="hidden"><input value="zh-CN" name="hl" type="hidden"><input name="source" type="hidden" value="hp"><div class="ds" style="height:32px;margin:4px 0"><input autocomplete="off" class="lst" 
value="" title="Google 搜索" maxlength="2048" name="q" size="57" style="color:#000;margin:0;padding:5px 8px 0 6px;vertical-align:top"></div><br style="line-height:0"><span class="ds"><span class="lsbb"><input class="lsb" value="Google 搜索" name="btnG" type="submit"></span></span><span class="ds"><span class="lsbb"><input class="lsb" value=" 手气不错 " name="btnI" type="submit" onclick="if(this.form.q.value)this.checked=1; else top.location='/doodles/'"></span></span></td><td class="fl sblc" align="left" nowrap="" width="25%"><a href="/advanced_search?hl=zh-CN&authuser=0">高级搜索</a><a href="/language_tools?hl=zh-CN&authuser=0">语言工具</a></td></tr></table><input type="hidden" id="gbv" name="gbv" value="1"></form><div id="gac_scont"></div><div style="font-size:83%;min-height:3.5em"><br><div id=als><font size=-1 id=addlang>Google.com.hk 使用下列语言: <a href="http://www.google.com.hk/setprefs?sig=0_3kYku_CIBSSkuIbC4nSt5ExW7dk%3D&hl=zh-TW&source=homepage">中文(繁�)</a> <a href="http://www.google.com.hk/setprefs?sig=0_3kYku_CIBSSkuIbC4nSt5ExW7dk%3D&hl=en&source=homepage">English</a></font><br><br></div></div><span id="footer"><div style="font-size:10pt"><div id="fll" style="margin:19px auto;text-align:center"><a href="/intl/zh-CN/ads/">加入营销计划</a><a href="/intl/zh-CN/about.html">Google 大全</a><a href="http://www.google.com.hk/setprefdomain?prefdom=US&sig=0_XK6YClW-CDCzO0fmA7NCMptmg18%3D" id="fehl">Google.com</a></div></div><p style="color:#767676;font-size:8pt">© 2013 - <a href="/intl/zh-CN/policies/">隐私权和使用条款</a></p></span></center><div id=xjsd></div><div id=xjsi data-jiis="bp"><script>if(google.y)google.y.first=[];(function(){function b(a){window.setTimeout(function(){var c=document.createElement("script");c.src=a;document.getElementById("xjsd").appendChild(c)},0)}google.dljp=function(a){google.xjsu=a;b(a)};google.dlj=b;})();
if(!google.xjs){window._=window._||{};window._._DumpException=function(e){throw e};if(google.timers&&google.timers.load.t){google.timers.load.t.xjsls=new Date().getTime();}google.dljp('/xjs/_/js/k\x3dxjs.hp.en_US.A5v1tjnVtNI.O/m\x3dsb_he,pcc/rt\x3dj/d\x3d1/sv\x3d1/rs\x3dAItRSTNmtBogoe71kCve1XXbKYbv7Nr5UA');google.xjs=1;}google.pmc={"sb":{"agen":true,"cgen":true,"client":"heirloom-hp","dh":true,"ds":"","eqch":true,"fl":true,"host":"google.com.hk","msgs":{"dym":"您是不是要找:","lcky":"\u0026nbsp;手气不错\u0026nbsp;","lml":"了解详情","oskt":"输入工具","psrc":"该搜索已从您的\u003Ca href=\"/history\"\u003E网络历史记录\u003C/a\u003E中删除","psrl":"删除","sbit":"按图片搜索","srch":"Google 搜索"},"ovr":{},"pq":"","qcpw":false,"scd":10,"sce":5,"stok":"gaYuaP7JTM_ZpmT2ykQIxvnrUy8"},"hp":{},"pcc":{}};google.y.first.push(function(){if(google.med){google.med('init');google.initHistory();google.med('history');}});if(google.j&&google.j.en&&google.j.xi){window.setTimeout(google.j.xi,0);}</script></div><script>(function(){var b,c,d,e;function g(a,f){a.removeEventListener?(a.removeEventListener("load",f,!1),a.removeEventListener("error",f,!1)):(a.detachEvent("onload",f),a.detachEvent("onerror",f))}function h(a){e=(new Date).getTime();++c;a=a||window.event;a=a.target||a.srcElement;g(a,h)}var k=document.getElementsByTagName("img");b=k.length;
for(var l=c=0,m;l<b;++l)m=k[l],m.complete||"string"!=typeof m.src||!m.src?++c:m.addEventListener?(m.addEventListener("load",h,!1),m.addEventListener("error",h,!1)):(m.attachEvent("onload",h),m.attachEvent("onerror",h));d=b-c;
function n(){if(google.timers.load.t){google.timers.load.t.ol=(new Date).getTime();google.timers.load.t.iml=e;google.kCSI.imc=c;google.kCSI.imn=b;google.kCSI.imp=d;void 0!==google.stt&&(google.kCSI.stt=google.stt);google.csiReport&&google.csiReport()}}window.addEventListener?window.addEventListener("load",n,!1):window.attachEvent&&window.attachEvent("onload",n);google.timers.load.t.prt=e=(new Date).getTime();})();
</script></body></html>"""
# Normalise raw_xml to a text (unicode) string, then hand it to the XML parser.
# The `unicode` builtin only exists on Python 2; on Python 3 every `str` is
# already text, so fall back to `str` there.
try:
    _text_type = unicode
except NameError:
    _text_type = str

if not isinstance(raw_xml, _text_type):
    # Byte string: decode it as UTF-8, matching the coding line of this file.
    new_xml = raw_xml.decode('utf-8')
    print(isinstance(new_xml, _text_type))
    print(new_xml)
else:
    print('already unicode formats in using!')
    # Bind new_xml on this path too. It used to be left undefined here, so
    # the parse call below failed with NameError instead of parsing raw_xml.
    new_xml = raw_xml

# NOTE(review): raw_xml appears to hold an HTML page (lower-case doctype,
# unquoted attributes, unclosed tags), so the strict XML parser is expected to
# raise ParseError here -- confirm whether an HTML parser was intended.
# NOTE(review): on Python 2, passing a unicode string containing non-ASCII
# characters to fromstring() may raise UnicodeEncodeError -- TODO confirm.
tree = ElementTree.fromstring(new_xml)