|
28 | 28 | " * list 用法\n", |
29 | 29 | " * dict 用法\n", |
30 | 30 | " * tuple 用法\n", |
31 | | - "* 随机数介绍" |
| 31 | + "* 随机数介绍\n", |
| 32 | + "* 举例\n", |
| 33 | + " * 中文分词介绍\n", |
| 34 | + " * 小程序练习" |
32 | 35 | ] |
33 | 36 | }, |
34 | 37 | { |
|
1064 | 1067 | }, |
1065 | 1068 | { |
1066 | 1069 | "cell_type": "code", |
1067 | | - "execution_count": 22, |
| 1070 | + "execution_count": 34, |
1068 | 1071 | "metadata": { |
1069 | 1072 | "collapsed": false |
1070 | 1073 | }, |
1071 | 1074 | "outputs": [ |
1072 | | - { |
1073 | | - "name": "stderr", |
1074 | | - "output_type": "stream", |
1075 | | - "text": [ |
1076 | | - "Building prefix dict from the default dictionary ...\n", |
1077 | | - "Dumping model to file cache /var/folders/j8/7nj196c56plf7tg3rtjxjwhr0000gn/T/jieba.cache\n", |
1078 | | - "Loading model cost 2.197 seconds.\n", |
1079 | | - "Prefix dict has been built succesfully.\n" |
1080 | | - ] |
1081 | | - }, |
1082 | 1075 | { |
1083 | 1076 | "name": "stdout", |
1084 | 1077 | "output_type": "stream", |
1085 | 1078 | "text": [ |
1086 | | - "Full Mode: 今天/ 上海/ 的/ 天气/ 怎么样\n", |
| 1079 | + "Full Mode: 今天/ 天上/ 上海/ 的/ 天气/ 怎么/ 怎么样\n", |
1087 | 1080 | "Default Mode: 明天/ 纽约/ 下雨/ 么\n", |
1088 | 1081 | "现在, 天气, 怎么样\n", |
1089 | | - "2016, 年, 第一季度, 支付, 事业部, 交易量, 报表\n", |
| 1082 | + "小明, 硕士, 毕业, 于, 中国科学院, 计算所, ,, 后, 在, 日本京都大学, 深造\n", |
1090 | 1083 | "小明, 硕士, 毕业, 于, 中国, 科学, 学院, 科学院, 中国科学院, 计算, 计算所, ,, 后, 在, 日本, 京都, 大学, 日本京都大学, 深造\n" |
1091 | 1084 | ] |
1092 | 1085 | } |
1093 | 1086 | ], |
1094 | 1087 | "source": [ |
1095 | 1088 | "import jieba\n", |
1096 | 1089 | "\n", |
1097 | | - "seg_list = jieba.cut(\"今天上海的天气怎么样\", cut_all=False)\n", |
1098 | | - "print(\"Full Mode: \" + \"/ \".join(seg_list)) # 全模式\n", |
| 1090 | + "# 全模式\n", |
| 1091 | + "# 把句子中所有的可以成词的词语都扫描出来,速度非常快,但是不能解决歧义\n", |
| 1092 | + "seg_list = jieba.cut(\"今天上海的天气怎么样\", cut_all = True)\n", |
| 1093 | + "print(\"Full Mode: \" + \"/ \".join(seg_list)) \n", |
1099 | 1094 | "\n", |
1100 | | - "seg_list = jieba.cut(\"明天纽约下雨么\", cut_all=False)\n", |
1101 | | - "print(\"Default Mode: \" + \"/ \".join(seg_list)) # 精确模式\n", |
| 1095 | + "# 精确模式\n", |
| 1096 | + "# 试图将句子最精确地切开,适合文本分析\n", |
| 1097 | + "seg_list = jieba.cut(\"明天纽约下雨么\", cut_all = False)\n", |
| 1098 | + "print(\"Default Mode: \" + \"/ \".join(seg_list)) \n", |
1102 | 1099 | "\n", |
1103 | | - "seg_list = jieba.cut(\"现在天气怎么样\") # 默认是精确模式\n", |
| 1100 | + "# 默认是精确模式\n", |
| 1101 | + "seg_list = jieba.cut(\"现在天气怎么样\") \n", |
1104 | 1102 | "print(\", \".join(seg_list))\n", |
1105 | 1103 | "\n", |
1106 | | - "seg_list = jieba.cut(\"2016年第一季度支付事业部交易量报表\") # 默认是精确模式\n", |
| 1104 | + "# 默认是精确模式\n", |
| 1105 | + "seg_list = jieba.cut(\"小明硕士毕业于中国科学院计算所,后在日本京都大学深造\") \n", |
1107 | 1106 | "print(\", \".join(seg_list))\n", |
1108 | 1107 | "\n", |
1109 | | - "seg_list = jieba.cut_for_search(\"小明硕士毕业于中国科学院计算所,后在日本京都大学深造\") # 搜索引擎模式\n", |
| 1108 | + "# 搜索引擎模式\n", |
| 1109 | + "# 在精确模式的基础上,对长词再次切分,提高召回率,适合用于搜索引擎分词 \n", |
| 1110 | + "seg_list = jieba.cut_for_search(\"小明硕士毕业于中国科学院计算所,后在日本京都大学深造\") \n", |
1110 | 1111 | "print(\", \".join(seg_list))" |
1111 | 1112 | ] |
1112 | 1113 | }, |
|
1248 | 1249 | "name": "python", |
1249 | 1250 | "nbconvert_exporter": "python", |
1250 | 1251 | "pygments_lexer": "ipython3", |
1251 | | - "version": "3.5.1" |
| 1252 | + "version": "3.4.4" |
1252 | 1253 | } |
1253 | 1254 | }, |
1254 | 1255 | "nbformat": 4, |
|
0 commit comments