-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.yaml.example
More file actions
231 lines (203 loc) · 8.44 KB
/
config.yaml.example
File metadata and controls
231 lines (203 loc) · 8.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
# CSV to OpenSearch Indexer Configuration
# ========================================
# This is an example configuration file. Copy to config.yaml and modify as needed.
# Application settings
app:
  name: "CSV to OpenSearch Indexer"
  version: "1.0.0"
  log_level: "info"  # Options: debug, info, warn, error
  workers: 4  # Number of parallel workers for processing
# Input settings
input:
  csv:
    delimiter: ","  # CSV delimiter character
    skip_empty_lines: true  # Skip empty lines in CSV
    trim_spaces: true  # Trim spaces from field values
    encoding: "utf-8"  # File encoding
    max_field_size: 131072  # Maximum field size in bytes
# Output settings
output:
  xml:
    directory: "./output"  # Directory for XML files
    pretty_print: true  # Format XML with indentation
    include_header: true  # Include XML declaration
    root_element: "record"  # Root element name for XML
    encoding: "UTF-8"  # XML encoding
    create_subdirs: false  # Create subdirectories by date
    file_prefix: "record_"  # Prefix for generated XML files
# Tika settings
tika:
  enabled: true  # Enable/disable Tika parsing
  server_url: "http://localhost:9998"
  timeout_seconds: 30  # Request timeout
  retry_attempts: 3  # Number of retry attempts
  retry_delay_seconds: 2  # Delay between retries
  parse_metadata: true  # Extract metadata from documents
  detect_language: false  # Enable language detection
  ocr_enabled: false  # Enable OCR for images/PDFs
# OpenSearch settings
opensearch:
  addresses:
    - "http://localhost:9200"  # Can specify multiple nodes
  username: ""  # Leave empty for no auth
  password: ""  # Leave empty for no auth
  use_https: false  # Use HTTPS connection
  verify_certificates: false  # Verify SSL certificates

  # Index settings
  index:
    name: "csv-documents"  # Default index name
    number_of_shards: 1  # Number of primary shards
    number_of_replicas: 0  # Number of replica shards
    refresh_interval: "1s"  # Index refresh interval
    # Index mappings
    mappings:
      dynamic: true  # Allow dynamic field mapping
      date_detection: true  # Auto-detect date fields
      numeric_detection: true  # Auto-detect numeric fields

  # Indexing settings
  indexing:
    bulk_size: 100  # Documents per bulk request
    bulk_flush_interval: "5s"  # Maximum time between bulk flushes
    use_bulk: true  # Use bulk API for indexing
    refresh: true  # Refresh index after indexing
    pipeline: ""  # Ingest pipeline name (optional)
    routing: ""  # Document routing value (optional)

  # Connection pool settings
  connection:
    max_retries: 3  # Maximum retry attempts
    connection_timeout: 10  # Connection timeout in seconds
    request_timeout: 30  # Request timeout in seconds
    keepalive: true  # Use keepalive connections
    compression: false  # Enable gzip compression
# Processing settings
processing:
  batch_size: 50  # Records to process in batch
  parallel_processing: true  # Enable parallel processing
  max_goroutines: 10  # Maximum concurrent goroutines
  memory_limit_mb: 512  # Memory limit for processing

  # Field processing
  fields:
    lowercase_headers: false  # Convert headers to lowercase
    sanitize_field_names: true  # Sanitize field names for XML
    preserve_empty_fields: false  # Keep empty fields in output
    max_field_length: 0  # Maximum field length (0 = unlimited)
# Error handling
errors:
  continue_on_error: true  # Continue processing on errors
  max_errors: 100  # Maximum errors before stopping
  log_errors: true  # Log errors to file
  error_log_file: "errors.log"  # Error log file path
# Monitoring settings
monitoring:
  enabled: false  # Enable monitoring endpoints
  port: 8080  # Monitoring server port
  metrics_path: "/metrics"  # Metrics endpoint path
  health_path: "/health"  # Health check endpoint path
# Docker settings (for docker-compose integration)
docker:
  compose_file: "docker-compose.yml"
  services:
    opensearch:
      container_name: "opensearch"
      image: "opensearchproject/opensearch:2.11.0"
      memory_limit: "2g"
    tika:
      container_name: "tika-server"
      image: "apache/tika:2.9.1.0"
      memory_limit: "1g"
    dashboards:
      container_name: "opensearch-dashboards"
      image: "opensearchproject/opensearch-dashboards:2.11.0"
      memory_limit: "1g"
# Advanced settings
advanced:
  # CSV parsing
  csv:
    comment_character: "#"  # Comment character in CSV
    lazy_quotes: false  # Allow lazy quotes
    fields_per_record: 0  # Expected fields per record (0 = variable)

  # XML generation
  xml:
    namespace: ""  # XML namespace URI
    namespace_prefix: ""  # XML namespace prefix
    schema_location: ""  # XML schema location
    dtd_location: ""  # DTD location

  # OpenSearch
  opensearch:
    scroll_size: 1000  # Documents per scroll request
    scroll_timeout: "1m"  # Scroll context timeout
    term_vector: false  # Store term vectors
    store_source: true  # Store document source

  # Performance tuning
  performance:
    gc_percent: 100  # Go garbage collection percentage
    max_procs: 0  # GOMAXPROCS (0 = use all CPUs)
    buffer_size: 4096  # I/O buffer size in bytes
# Logging settings
logging:
  level: "info"  # Log level
  format: "json"  # Log format: json or text
  output: "stdout"  # Output: stdout, stderr, or file path
  file:
    path: "logs/app.log"  # Log file path
    max_size_mb: 100  # Maximum file size before rotation
    max_backups: 5  # Maximum number of backup files
    max_age_days: 30  # Maximum age of log files
    compress: true  # Compress rotated files
# Development settings
development:
  debug: false  # Enable debug mode
  verbose: false  # Enable verbose output
  dry_run: false  # Run without actual indexing
  sample_size: 0  # Process only N records (0 = all)
  skip_tika: false  # Skip Tika parsing
  skip_indexing: false  # Skip OpenSearch indexing
  preserve_xml: true  # Keep generated XML files
# Feature flags
features:
  auto_create_index: true  # Auto-create index if missing
  auto_detect_types: true  # Auto-detect field types
  deduplicate: false  # Remove duplicate records
  validate_xml: false  # Validate generated XML
  compress_xml: false  # Compress XML files
  cache_parsed_docs: false  # Cache Tika parsed documents
# Notification settings (optional)
notifications:
  enabled: false

  # Email notifications
  email:
    enabled: false
    smtp_host: "smtp.example.com"
    smtp_port: 587
    username: ""
    password: ""
    from: "[email protected]"
    to: ["[email protected]"]
    on_success: false
    on_error: true

  # Webhook notifications
  webhook:
    enabled: false
    url: "https://hooks.example.com/webhook"
    method: "POST"
    headers:
      Content-Type: "application/json"
      Authorization: "Bearer token"
    on_success: true
    on_error: true
# Security settings
security:
  api_key_enabled: false  # Enable API key authentication
  api_key: ""  # API key value
  tls:
    enabled: false  # Enable TLS
    cert_file: ""  # TLS certificate file
    key_file: ""  # TLS key file
    ca_file: ""  # CA certificate file
# Scheduled processing (cron-like)
schedule:
  enabled: false  # Enable scheduled processing
  cron: "0 2 * * *"  # Cron expression (daily at 2 AM)
  input_directory: "./data"  # Directory to watch for CSV files
  archive_directory: "./archive"  # Move processed files here
  error_directory: "./failed"  # Move failed files here