-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmain.py
More file actions
114 lines (102 loc) · 3.39 KB
/
main.py
File metadata and controls
114 lines (102 loc) · 3.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import argparse
from utils.decode import decode_escape
from commands.pdf import start as pdf_start
# Names of the chunking strategies accepted by the ``--chunk-strategy`` flag.
# The special CLI value "all" is expanded to this whole list.
CHUNK_STRATEGIES = [
    # Separator-based strategies.
    "by_separator",
    "by_separators",
    # Token-based strategies.
    "by_token_tiktoken",
    "by_token_spacy",
    "by_token_nltk",
    # "by_token_huggingface",  # not enabled
]
def _build_parser() -> argparse.ArgumentParser:
    """Build the top-level CLI parser together with its ``pdf`` sub-command."""
    parser = argparse.ArgumentParser(
        description="Crawl data from websites or pdf files, chunk them using different strategies, and store them in .jsonl files.",
    )
    subparsers = parser.add_subparsers(
        dest="command", required=True, help="Available commands"
    )

    # Subparser for the 'pdf' command
    pdf_parser = subparsers.add_parser("pdf", help="Crawl data from PDF files")
    pdf_parser.add_argument(
        "--file",
        action="append",
        type=str,
        required=True,
        help="Path to the PDF file or files to be processed.",
    )
    pdf_parser.add_argument(
        "--chunk-strategy",
        choices=["all", *CHUNK_STRATEGIES],
        nargs="+",
        required=True,
        help="Strategy for chunking the PDF content.",
    )
    pdf_parser.add_argument(
        "--chunk-size",
        type=int,
        default=None,
        help="Size of each chunk in 'chunk_strategy' unit.",
    )
    pdf_parser.add_argument(
        "--chunk-overlap",
        type=int,
        default=None,
        help="Overlap size between chunks in 'chunk_strategy' unit.",
    )
    pdf_parser.add_argument(
        "--chunk-separator",
        type=decode_escape,
        # "extend" accumulates values across repeated uses of the flag, which
        # is what the help text promises; the default "store" action would
        # keep only the values from the last occurrence.
        action="extend",
        default=[],
        nargs="*",
        help="Separator(s) to use for chunking. Can be specified multiple times for multiple separators.",
    )
    pdf_parser.add_argument(
        "--output-folder",
        type=str,
        default="out",
        help="Folder to save the output JSONL files.",
    )
    pdf_parser.add_argument(
        "--clean-previous",
        action="store_true",
        default=False,
        help="Clean the output folder before running the command.",
    )
    pdf_parser.add_argument(
        "--verbose", action="store_true", default=False, help="Enable verbose output."
    )
    pdf_parser.set_defaults(func=pdf_start)
    return parser


def _validate_pdf_args(
    parser: argparse.ArgumentParser, args: argparse.Namespace
) -> None:
    """Reject inconsistent ``pdf`` flag combinations via ``parser.error``."""
    if (
        args.chunk_size is not None
        and args.chunk_overlap is not None
        and args.chunk_size < args.chunk_overlap
    ):
        parser.error("Chunk size must be greater than or equal to chunk overlap.")

    # These strategies accept a single separator only.
    single_separator_strategies = {"by_separator", "by_token_spacy", "by_token_nltk"}
    if len(args.chunk_separator) > 1 and single_separator_strategies.intersection(
        args.chunk_strategy
    ):
        parser.error(
            "Only one separator can be used with ('by_separator', 'by_token_nltk' and 'by_token_spacy') strategies. Use 'by_separators' alone for multiple separators."
        )


def main(argv: "list[str] | None" = None) -> None:
    """Parse the command line, validate the flags and run the chosen command.

    Args:
        argv: Argument list to parse instead of the process arguments.
            ``None`` (the default) makes argparse read ``sys.argv[1:]``, so
            calling ``main()`` with no arguments behaves as before.

    Raises:
        SystemExit: Raised by argparse when the arguments are invalid or fail
            the extra consistency checks (``parser.error``), and for ``--help``.
    """
    parser = _build_parser()

    # Parse
    args = parser.parse_args(argv)

    if args.command == "pdf":
        if "all" in args.chunk_strategy:
            # Pass a copy so the command can never mutate the module-level list.
            args.chunk_strategy = list(CHUNK_STRATEGIES)

        # Validate pdf flags
        _validate_pdf_args(parser, args)

        # Dispatch; these keyword arguments are specific to the pdf command.
        args.func(
            file_paths=args.file,
            chunk_strategies=args.chunk_strategy,
            chunk_size=args.chunk_size,
            chunk_overlap=args.chunk_overlap,
            chunk_separators=args.chunk_separator,
            output_folder=args.output_folder,
            clean_previous_runs=args.clean_previous,
            verbose=args.verbose,
        )
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()