forked from watson-developer-cloud/python-sdk
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdocument_conversion_v1.py
More file actions
130 lines (122 loc) · 4.09 KB
/
document_conversion_v1.py
File metadata and controls
130 lines (122 loc) · 4.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# coding=utf-8
import json
from os.path import join, dirname
from io import open
from watson_developer_cloud import DocumentConversionV1
# Shared client used by every example below.  The username and password are
# placeholder strings and have to be replaced with real service credentials.
document_conversion = DocumentConversionV1(
    version='2016-02-09',
    username='YOUR SERVICE USERNAME',
    password='YOUR SERVICE PASSWORD',
)
# Example of retrieving html or plain text: request the NORMALIZED_HTML
# conversion target and print the `content` attribute of the result.
config = {'conversion_target': DocumentConversionV1.NORMALIZED_HTML}
html_path = join(dirname(__file__), '../resources/example.html')
with open(html_path, encoding='utf8') as document:
    html_response = document_conversion.convert_document(
        document=document, config=config, media_type='text/html')
    print(html_response.content)
# Example with JSON: convert the same sample document with the ANSWER_UNITS
# conversion target and pretty-print the result as JSON.
with open(join(dirname(__file__), '../resources/example.html'),
          encoding='utf8') as document:
    # Build a fresh config rather than mutating the dict left behind by the
    # previous example, so this example keeps working when that one is
    # removed, commented out, or reordered.
    config = {'conversion_target': DocumentConversionV1.ANSWER_UNITS}
    print(json.dumps(
        document_conversion.convert_document(document=document, config=config),
        indent=2))
# Examples of index_document API

# Dry run that supplies only a document (no metadata argument).
print("########## Example of a dry run of index_document with only a "
      "document ##########")
config = {'retrieve_and_rank': {'dry_run': 'true'}}
sample_path = join(dirname(__file__), '../resources/example.html')
with open(sample_path, encoding='utf8') as document:
    dry_run_result = document_conversion.index_document(
        config=config, document=document)
    print(json.dumps(dry_run_result, indent=2))
# Dry run that supplies only metadata; no document file is opened here.
print("########## Example of a dry run of index_document with only "
      "metadata ##########")
config = {'retrieve_and_rank': {'dry_run': 'true'}}
metadata = {'metadata': [{'name': 'id', 'value': '12345'}]}
metadata_only_result = document_conversion.index_document(
    config=config, metadata=metadata)
print(json.dumps(metadata_only_result, indent=2))
# Dry run that supplies both the sample document and a metadata entry.
print("########## Example of a dry run of index_document with document "
      "and metadata ##########")
config = {'retrieve_and_rank': {'dry_run': 'true'}}
metadata = {'metadata': [{'name': 'id', 'value': '12345'}]}
sample_path = join(dirname(__file__), '../resources/example.html')
with open(sample_path, encoding='utf8') as document:
    combined_result = document_conversion.index_document(
        config=config, document=document, metadata=metadata)
    print(json.dumps(combined_result, indent=2))
# Dry run that supplies a document, metadata, and an extra 'convert_document'
# section in the config alongside the 'retrieve_and_rank' section.
print(
    "########## Example of a dry run of index_document with document, "
    "metadata, "
    # Trailing space added so the banner does not print
    # "conversion##########", matching the other banners in this script.
    "and additional config for conversion "
    "##########")
with open(join(dirname(__file__), '../resources/example.html'),
          encoding='utf8') as document:
    config = {
        'convert_document': {
            'normalized_html': {
                # NOTE(review): presumably excludes elements matching this
                # XPath from the converted HTML -- confirm against the
                # service documentation.
                'exclude_content': {"xpaths": ["//body/div"]}
            }
        },
        'retrieve_and_rank': {
            'dry_run': 'true'
        }
    }
    metadata = {
        'metadata': [
            {'name': 'id', 'value': '12345'}
        ]
    }
    print(json.dumps(
        document_conversion.index_document(config=config, document=document,
                                           metadata=metadata), indent=2))
# Disabled example: uncomment and fill in the placeholders below to run it.
# print("########## Example of index_document with document, metadata (A "
#       "service instance id, SOLR cluster id, and "
#       "a SOLR collection name must be provided from the Retrieve and Rank "
#       "service in order to index) ##########")
# with open(join(dirname(__file__), '../resources/example.html'),
#           'r') as document:
#     config = {
#         'retrieve_and_rank': {
#             'dry_run': 'false',
#             'service_instance_id':
#                 'YOUR RETRIEVE AND RANK SERVICE INSTANCE ID',
#             'cluster_id': 'YOUR RETRIEVE AND RANK SERVICE SOLR CLUSTER ID',
#             'search_collection':
#                 'YOUR RETRIEVE AND RANK SERVICE SOLR SEARCH COLLECTION NAME'
#         }
#     }
#     metadata = {
#         'metadata': [
#             {'name': 'id', 'value': '12345'}
#         ]
#     }
#     print(json.dumps(
#         document_conversion.index_document(config=config,
#                                            document=document,
#                                            metadata=metadata),
#         indent=2))