From cc36ba74a786a643b75ef92d0f49f173ed64490e Mon Sep 17 00:00:00 2001 From: azayz Date: Tue, 4 Apr 2023 17:24:16 +0100 Subject: [PATCH 1/3] docs: add audio2text showcase Signed-off-by: azayz --- docs/tutorial/audio2text/audio2text.md | 77 +++++++++++++++++++++++ docs/tutorial/audio2text/requirements.txt | 1 + 2 files changed, 78 insertions(+) create mode 100644 docs/tutorial/audio2text/audio2text.md create mode 100644 docs/tutorial/audio2text/requirements.txt diff --git a/docs/tutorial/audio2text/audio2text.md b/docs/tutorial/audio2text/audio2text.md new file mode 100644 index 00000000000..fd913785aa1 --- /dev/null +++ b/docs/tutorial/audio2text/audio2text.md @@ -0,0 +1,77 @@ +# Creating an Audio to Text App with Jina and DocArray V2 + +This is how you can build an Audio to Text app using both Jina and DocarrayV2 + +We will use: + +* DocarrayV2: Helps us to load and preprocess multimodal data such as image, text and audio in our case +* Jina: Helps us serve the model quickly and create a client + +First let's install requirements + +## 💾 Installation + +```bash +pip install -r requirments.txt +``` + +Now let's import necessary libraries + + +```python +import whisper +from jina import Executor, requests, Deployment +from docarray import BaseDoc, DocArray +from docarray.typing import AudioUrl +``` + +Now we need to create the schema of our input and output documents. 
Since our input is an audio file, +our input schema should contain an AudioUrl like the following: + +```python +class AudioURL(BaseDoc): + audio: AudioUrl +``` + +As for the output schema we would like to receive the transcribed text so we use the following: + +```python +class Response(BaseDoc): + text: str +``` + +Now it's time to create our model. We wrap our model in a Jina Executor; this allows us to serve the model +later on and expose its /transcribe endpoint: + +```python +class WhisperExecutor(Executor): + def __init__(self, device: str, *args, **kwargs): + super().__init__(*args, **kwargs) + self.model = whisper.load_model("medium.en", device=device) + + @requests + def transcribe(self, docs: DocArray[AudioURL], **kwargs) -> DocArray[Response]: + response_docs = DocArray[Response]() + for doc in docs: + transcribed_text = self.model.transcribe(str(doc.audio))['text'] + response_docs.append(Response(text=transcribed_text)) + return response_docs +``` + +Now we can leverage the Deployment object provided by Jina to use this executor, +then we send a request to the /transcribe endpoint. Here we are using an audio file previously recorded +that says, "A man reading a book", saved under resources/audio.mp3, but feel free to use your own audio. + +```python +with Deployment( + uses=WhisperExecutor, uses_with={'device': "cpu"}, port=12349, timeout_ready=-1 +) as d: + docs = d.post( + on='/transcribe', + inputs=[AudioURL(audio='resources/audio.mp3')], + return_type=DocArray[Response], + ) + print(docs[0].text) +``` + +And we get the transcribed result! 
\ No newline at end of file diff --git a/docs/tutorial/audio2text/requirements.txt b/docs/tutorial/audio2text/requirements.txt new file mode 100644 index 00000000000..03e394c0d5f --- /dev/null +++ b/docs/tutorial/audio2text/requirements.txt @@ -0,0 +1 @@ +openai-whisper==20230308 \ No newline at end of file From ace14239ab867ce8b001b7f333d2e6c28aa1ff2a Mon Sep 17 00:00:00 2001 From: azayz Date: Wed, 5 Apr 2023 08:07:27 +0100 Subject: [PATCH 2/3] refactor: move to how to Signed-off-by: azayz --- docs/{tutorial/audio2text => how_to}/audio2text.md | 12 +++++++----- docs/tutorial/audio2text/requirements.txt | 1 - 2 files changed, 7 insertions(+), 6 deletions(-) rename docs/{tutorial/audio2text => how_to}/audio2text.md (88%) delete mode 100644 docs/tutorial/audio2text/requirements.txt diff --git a/docs/tutorial/audio2text/audio2text.md b/docs/how_to/audio2text.md similarity index 88% rename from docs/tutorial/audio2text/audio2text.md rename to docs/how_to/audio2text.md index fd913785aa1..5d7a03b5efa 100644 --- a/docs/tutorial/audio2text/audio2text.md +++ b/docs/how_to/audio2text.md @@ -12,7 +12,9 @@ First let's install requirements ## 💾 Installation ```bash -pip install -r requirments.txt +pip install transformers +pip install openai-whisper +pip install jina ``` Now let's import necessary libraries @@ -21,7 +23,7 @@ Now let's import necessary libraries ```python import whisper from jina import Executor, requests, Deployment -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.typing import AudioUrl ``` @@ -50,8 +52,8 @@ class WhisperExecutor(Executor): self.model = whisper.load_model("medium.en", device=device) @requests - def transcribe(self, docs: DocArray[AudioURL], **kwargs) -> DocArray[Response]: - response_docs = DocArray[Response]() + def transcribe(self, docs: DocList[AudioURL], **kwargs) -> DocList[Response]: + response_docs = DocList[Response]() for doc in docs: transcribed_text = 
self.model.transcribe(str(doc.audio))['text'] response_docs.append(Response(text=transcribed_text)) @@ -69,7 +71,7 @@ with Deployment( docs = d.post( on='/transcribe', inputs=[AudioURL(audio='resources/audio.mp3')], - return_type=DocArray[Response], + return_type=DocList[Response], ) print(docs[0].text) ``` diff --git a/docs/tutorial/audio2text/requirements.txt b/docs/tutorial/audio2text/requirements.txt deleted file mode 100644 index 03e394c0d5f..00000000000 --- a/docs/tutorial/audio2text/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -openai-whisper==20230308 \ No newline at end of file From 5509cfcf03e0c80a7f6297b86f357631ba8b8017 Mon Sep 17 00:00:00 2001 From: azayz Date: Wed, 5 Apr 2023 09:10:03 +0100 Subject: [PATCH 3/3] fix: add line to mkdocs Signed-off-by: azayz --- docs/how_to/audio2text.md | 2 +- mkdocs.yml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/how_to/audio2text.md b/docs/how_to/audio2text.md index 5d7a03b5efa..fcec869ce0f 100644 --- a/docs/how_to/audio2text.md +++ b/docs/how_to/audio2text.md @@ -1,6 +1,6 @@ # Creating an Audio to Text App with Jina and DocArray V2 -This is how you can build an Audio to Text app using both Jina and DocarrayV2 +This is how you can build an Audio to Text app using Jina, Docarray and Whisper We will use: diff --git a/mkdocs.yml b/mkdocs.yml index 9e4209520ef..ca72a966197 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -82,5 +82,6 @@ nav: - how_to/add_doc_index.md - how_to/multimodal_training_and_serving.md - how_to/optimize_performance_with_id_generation.md + - how_to/audio2text.md - ... - Contributing: CONTRIBUTING.md