Skip to content

Commit c41368f

Browse files
authored
Add unstructured processor (#6)
1 parent e9f1248 commit c41368f

File tree

2 files changed

+38
-0
lines changed

2 files changed

+38
-0
lines changed

src/DocumentProcessor.php

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,11 @@ enum DocumentProcessor: string
2424
* Uses LlamaCloud https://cloud.llamaindex.ai/ as document processor to extract text
2525
*/
2626
case LLAMAPARSE = 'llama';
27+
28+
/**
29+
* The Unstructured processor
30+
*
31+
* Uses Unstructured https://unstructured.io/ as document processor to extract text
32+
*/
33+
case UNSTRUCTURED = 'unstructured';
2734
}

tests/ParseProcessorSelectionTest.php

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,3 +99,34 @@
9999

100100
$mockClient->assertSentCount(1);
101101
});
102+
103+
test('unstructured can be selected as processor', function () {
104+
$mockClient = MockClient::global([
105+
ExtractTextRequest::class => MockResponse::fixture('extract-text-empty'),
106+
]);
107+
108+
$connector = new ParseConnector('fake', 'http://localhost:5002');
109+
$connector->withMockClient($mockClient);
110+
111+
$connector->parse(
112+
url: 'http://localhost/empty.pdf',
113+
options: new ParseOption(DocumentProcessor::UNSTRUCTURED),
114+
);
115+
116+
$mockClient->assertSent(ExtractTextRequest::class);
117+
118+
$mockClient->assertSent(function (Request $request, Response $response) {
119+
if (! $request instanceof ExtractTextRequest) {
120+
return false;
121+
}
122+
123+
/** @var array */
124+
$body = $request->body()->all();
125+
126+
return $body['url'] === 'http://localhost/empty.pdf'
127+
&& $body['mime_type'] === 'application/pdf'
128+
&& $body['driver'] === 'unstructured';
129+
});
130+
131+
$mockClient->assertSentCount(1);
132+
});

0 commit comments

Comments
 (0)