Skip to content

Commit 64d8d6c

Browse files
committed
bug: download HF model during build
1 parent 44fb392 commit 64d8d6c

File tree

1 file changed

+20
-4
lines changed

1 file changed

+20
-4
lines changed

docker/entity-extraction-pipeline/Dockerfile

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,19 +11,35 @@ COPY docker/entity-extraction-pipeline/requirements.txt .
1111
# Install the Python dependencies listed in requirements.txt; --no-cache-dir keeps pip's download cache out of the layer
RUN pip install --no-cache-dir -r requirements.txt
1212
# Fetch the NLTK "stopwords" corpus at build time using NLTK's downloader module
RUN python -m nltk.downloader stopwords
1313
# Install the packaged metaextractor-spacy model straight from its wheel on the Hugging Face Hub.
# --no-cache-dir keeps pip's download cache out of the image layer, matching the requirements install.
RUN pip install --no-cache-dir https://huggingface.co/finding-fossils/metaextractor-spacy/resolve/main/en_metaextractor_spacy-any-py3-none-any.whl
14+
# install git-lfs to be able to clone model weights from huggingface
15+
# git and git-lfs are needed to clone the model repository; ca-certificates is listed
# explicitly because the clone goes over HTTPS and recommended packages are skipped.
# The apt package lists are removed in the same layer so they never end up in the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        git \
        git-lfs \
    && rm -rf /var/lib/apt/lists/*
16+
# download the HF model into /app/models/ner/metaextractor
17+
# Clone the model repository (weights are pulled through git-lfs) into ./models/ner/metaextractor.
# --depth 1 skips the commit history, and the .git directory is dropped in the same layer:
# it would otherwise keep a second copy of every LFS object under .git/lfs inside the image.
# NOTE(review): assumes nothing at runtime needs the clone's git metadata — confirm.
RUN mkdir -p ./models/ner/ \
    && cd ./models/ner/ \
    && git lfs install \
    && git clone --depth 1 https://huggingface.co/finding-fossils/metaextractor \
    && rm -rf ./metaextractor/.git
1421

1522
# Copy the pipeline source code (the src folder) into the container
1623
COPY src ./src
1724

25+
# Set default env variables for when running the container
26+
# Runtime defaults; each one can be overridden with `docker run -e NAME=value`.
# MAX_ARTICLES and MAX_SENTENCES are forwarded to the pipeline's command line by the ENTRYPOINT.
ENV USE_NER_MODEL_TYPE=huggingface \
    MAX_ARTICLES=-1 \
    MAX_SENTENCES=-1
29+
1830
# non-root user control inspired from here: https://stackoverflow.com/questions/66349101/docker-non-root-user-does-not-have-writing-permissions-when-using-volumes
1931
# Create a non-root user that owns the inputs/outputs directories by default
2032
# System account (-r) for running the pipeline; no specific user ID is requested
RUN useradd -r extraction-user
21-
RUN mkdir ./inputs && chown extraction-user ./inputs
22-
RUN mkdir ./outputs && chown extraction-user ./outputs
33+
# Create both data directories and hand them to extraction-user in a single layer,
# so the unprivileged user can read its inputs and write its outputs.
RUN mkdir /inputs /outputs \
    && chown extraction-user /inputs /outputs
2335
# Declare the "inputs" and "outputs" folders as volumes
24-
VOLUME ["./inputs", "./outputs"]
36+
# Kept after the mkdir/chown steps: build steps that change these paths after the VOLUME declaration would have their changes discarded
VOLUME ["/inputs", "/outputs"]
2537

2638
# Set the entry point and command to run the script
2739
# From here on, build steps and the container's main process run as the unprivileged extraction-user
USER extraction-user
2840
# Prints a detailed listing of /app into the build log — presumably a debugging aid (TODO confirm it is still needed)
RUN ls -alp /app
29-
ENTRYPOINT python src/pipeline/entity_extraction_pipeline.py --article_text_path ./inputs/ --output_path ./outputs/
41+
# Run the extraction pipeline on the /inputs volume and write results to /outputs.
# Shell form is kept so MAX_ARTICLES / MAX_SENTENCES are expanded from the container
# environment at start-up; `exec` replaces the wrapping /bin/sh with python so the
# pipeline becomes PID 1 and receives the signals sent by `docker stop`.
# The expansions are quoted so each value is always passed as exactly one argument.
ENTRYPOINT exec python src/pipeline/entity_extraction_pipeline.py \
    --article_text_path /inputs/ \
    --output_path /outputs/ \
    --max_articles "${MAX_ARTICLES}" \
    --max_sentences "${MAX_SENTENCES}"

0 commit comments

Comments
 (0)