-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.yaml.example
More file actions
231 lines (203 loc) · 8.44 KB
/
config.yaml.example
File metadata and controls
231 lines (203 loc) · 8.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
# CSV to OpenSearch Indexer Configuration
# ========================================
# This is an example configuration file. Copy to config.yaml and modify as needed.
# Application settings
app:
  name: "CSV to OpenSearch Indexer"
  version: "1.0.0"
  log_level: "info"  # Options: debug, info, warn, error
  workers: 4  # Number of parallel workers for processing
# Input settings
input:
  csv:
    delimiter: ","  # CSV delimiter character
    skip_empty_lines: true  # Skip empty lines in CSV
    trim_spaces: true  # Trim spaces from field values
    encoding: "utf-8"  # File encoding
    max_field_size: 131072  # Maximum field size in bytes
# Output settings
output:
  xml:
    directory: "./output"  # Directory for XML files
    pretty_print: true  # Format XML with indentation
    include_header: true  # Include XML declaration
    root_element: "record"  # Root element name for XML
    encoding: "UTF-8"  # XML encoding
    create_subdirs: false  # Create subdirectories by date
    file_prefix: "record_"  # Prefix for generated XML files
# Tika settings
tika:
  enabled: true  # Enable/disable Tika parsing
  server_url: "http://localhost:9998"
  timeout_seconds: 30  # Request timeout
  retry_attempts: 3  # Number of retry attempts
  retry_delay_seconds: 2  # Delay between retries
  parse_metadata: true  # Extract metadata from documents
  detect_language: false  # Enable language detection
  ocr_enabled: false  # Enable OCR for images/PDFs
# OpenSearch settings
opensearch:
  addresses:
    - "http://localhost:9200"  # Can specify multiple nodes
  username: ""  # Leave empty for no auth
  password: ""  # Leave empty for no auth
  use_https: false  # Use HTTPS connection
  verify_certificates: false  # Verify SSL certificates

  # Index settings
  index:
    name: "csv-documents"  # Default index name
    number_of_shards: 1  # Number of primary shards
    number_of_replicas: 0  # Number of replica shards
    refresh_interval: "1s"  # Index refresh interval
    # Index mappings
    mappings:
      dynamic: true  # Allow dynamic field mapping
      date_detection: true  # Auto-detect date fields
      numeric_detection: true  # Auto-detect numeric fields

  # Indexing settings
  indexing:
    bulk_size: 100  # Documents per bulk request
    bulk_flush_interval: "5s"  # Maximum time between bulk flushes
    use_bulk: true  # Use bulk API for indexing
    refresh: true  # Refresh index after indexing
    pipeline: ""  # Ingest pipeline name (optional)
    routing: ""  # Document routing value (optional)

  # Connection pool settings
  connection:
    max_retries: 3  # Maximum retry attempts
    connection_timeout: 10  # Connection timeout in seconds
    request_timeout: 30  # Request timeout in seconds
    keepalive: true  # Use keepalive connections
    compression: false  # Enable gzip compression
# Processing settings
processing:
  batch_size: 50  # Records to process in batch
  parallel_processing: true  # Enable parallel processing
  max_goroutines: 10  # Maximum concurrent goroutines
  memory_limit_mb: 512  # Memory limit for processing

  # Field processing
  fields:
    lowercase_headers: false  # Convert headers to lowercase
    sanitize_field_names: true  # Sanitize field names for XML
    preserve_empty_fields: false  # Keep empty fields in output
    max_field_length: 0  # Maximum field length (0 = unlimited)
# Error handling
errors:
  continue_on_error: true  # Continue processing on errors
  max_errors: 100  # Maximum errors before stopping
  log_errors: true  # Log errors to file
  error_log_file: "errors.log"  # Error log file path
# Monitoring settings
monitoring:
  enabled: false  # Enable monitoring endpoints
  port: 8080  # Monitoring server port
  metrics_path: "/metrics"  # Metrics endpoint path
  health_path: "/health"  # Health check endpoint path
# Docker settings (for docker-compose integration)
docker:
  compose_file: "docker-compose.yml"
  services:
    opensearch:
      container_name: "opensearch"
      image: "opensearchproject/opensearch:2.11.0"
      memory_limit: "2g"
    tika:
      container_name: "tika-server"
      image: "apache/tika:2.9.1.0"
      memory_limit: "1g"
    dashboards:
      container_name: "opensearch-dashboards"
      image: "opensearchproject/opensearch-dashboards:2.11.0"
      memory_limit: "1g"
# Advanced settings
advanced:
  # CSV parsing
  csv:
    comment_character: "#"  # Comment character in CSV
    lazy_quotes: false  # Allow lazy quotes
    fields_per_record: 0  # Expected fields per record (0 = variable)

  # XML generation
  xml:
    namespace: ""  # XML namespace URI
    namespace_prefix: ""  # XML namespace prefix
    schema_location: ""  # XML schema location
    dtd_location: ""  # DTD location

  # OpenSearch
  opensearch:
    scroll_size: 1000  # Documents per scroll request
    scroll_timeout: "1m"  # Scroll context timeout
    term_vector: false  # Store term vectors
    store_source: true  # Store document source

  # Performance tuning
  performance:
    gc_percent: 100  # Go garbage collection percentage
    max_procs: 0  # GOMAXPROCS (0 = use all CPUs)
    buffer_size: 4096  # I/O buffer size in bytes
# Logging settings
logging:
  level: "info"  # Log level
  format: "json"  # Log format: json or text
  output: "stdout"  # Output: stdout, stderr, or file path
  file:
    path: "logs/app.log"  # Log file path
    max_size_mb: 100  # Maximum file size before rotation
    max_backups: 5  # Maximum number of backup files
    max_age_days: 30  # Maximum age of log files
    compress: true  # Compress rotated files
# Development settings
development:
  debug: false  # Enable debug mode
  verbose: false  # Enable verbose output
  dry_run: false  # Run without actual indexing
  sample_size: 0  # Process only N records (0 = all)
  skip_tika: false  # Skip Tika parsing
  skip_indexing: false  # Skip OpenSearch indexing
  preserve_xml: true  # Keep generated XML files
# Feature flags
features:
  auto_create_index: true  # Auto-create index if missing
  auto_detect_types: true  # Auto-detect field types
  deduplicate: false  # Remove duplicate records
  validate_xml: false  # Validate generated XML
  compress_xml: false  # Compress XML files
  cache_parsed_docs: false  # Cache Tika parsed documents
# Notification settings (optional)
notifications:
  enabled: false

  # Email notifications
  email:
    enabled: false
    smtp_host: "smtp.example.com"
    smtp_port: 587
    username: ""
    password: ""
    from: "[email protected]"
    to: ["[email protected]"]
    on_success: false
    on_error: true

  # Webhook notifications
  webhook:
    enabled: false
    url: "https://hooks.example.com/webhook"
    method: "POST"
    headers:
      Content-Type: "application/json"
      Authorization: "Bearer token"
    on_success: true
    on_error: true
# Security settings
security:
  api_key_enabled: false  # Enable API key authentication
  api_key: ""  # API key value
  tls:
    enabled: false  # Enable TLS
    cert_file: ""  # TLS certificate file
    key_file: ""  # TLS key file
    ca_file: ""  # CA certificate file
# Scheduled processing (cron-like)
schedule:
  enabled: false  # Enable scheduled processing
  cron: "0 2 * * *"  # Cron expression (daily at 2 AM)
  input_directory: "./data"  # Directory to watch for CSV files
  archive_directory: "./archive"  # Move processed files here
  error_directory: "./failed"  # Move failed files here