Docling's DocumentConverter shape (text_items, prov[0].page_no,
export_to_markdown signature) still moves between 2.x minor releases.
Constrain docling to >=2.0.0,<2.15 so that a wheel bump cannot silently
break the defensive walkers in app/ingestion/docling_extractor.py before
a staging smoke test has run against the new minor.
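Most other runtime deps get a comparable upper bound at the next major
or minor (python-multipart, pdfminer.six, regex and
python-magic/python-magic-bin remain uncapped):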
- web/api: fastapi <0.117, uvicorn <0.33, pydantic <3
- db: sqlalchemy <2.1, psycopg <3.3, alembic <1.14
- search: opensearch-py <3, qdrant-client <1.13
- ingest: ocrmypdf <17, pikepdf <10, pypdf <6
- ml: FlagEmbedding <2, sentence-transformers <4, transformers <5,
  torch <3, numpy <3
- ops/utils: structlog <26, orjson <4, httpx <0.29, click <9
Lift any specific upper bound only after the corresponding regression
test passes on a staging upgrade.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
98 lines
2.5 KiB
TOML
98 lines
2.5 KiB
TOML
# Build the distribution with Hatchling as the build backend.
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

# Core package metadata and runtime dependencies.
[project]
name = "legacy-knowledge-indexer"
version = "0.1.0"
description = "LegacyHUB - production-grade ingestion and hybrid search over legacy PDF archives"
requires-python = ">=3.11,<3.13"
authors = [{ name = "TeamHUB" }]
license = { text = "Apache-2.0" }
readme = "README.md"

# Runtime dependencies, grouped by subsystem. Most entries carry a lower
# bound plus an upper bound at the next major or minor release; lift an
# upper bound only after the corresponding regression test passes on a
# staging upgrade.
# NOTE(review): python-multipart, pdfminer.six, regex, python-magic and
# python-magic-bin have no upper bound -- confirm this is intentional.
dependencies = [
    # Web/API
    "fastapi>=0.115.0,<0.117",
    "uvicorn[standard]>=0.30.0,<0.33",
    "pydantic>=2.7.0,<3",
    "pydantic-settings>=2.4.0,<3",
    "python-multipart>=0.0.9",

    # DB
    "sqlalchemy>=2.0.30,<2.1",
    "psycopg[binary]>=3.2.0,<3.3",
    "alembic>=1.13.0,<1.14",

    # Object storage
    "minio>=7.2.7,<8",

    # Search/index
    "opensearch-py>=2.6.0,<3",
    "qdrant-client>=1.10.0,<1.13",

    # Workers
    "celery>=5.4.0,<6",
    "redis>=5.0.7,<6",

    # Ingestion
    "ocrmypdf>=16.4.0,<17",
    "pikepdf>=9.0.0,<10",
    "pypdf>=4.3.0,<6",
    "pdfminer.six>=20240706",
    # Docling is capped below 2.15 because its DocumentConverter API still
    # moves between minor releases; lift the upper bound only after a smoke
    # test on a staging corpus.
    "docling>=2.0.0,<2.15",

    # ML - keep FlagEmbedding/sentence-transformers/transformers within the
    # families that have been verified against the reranker contract tests.
    # Torch follows the family-major pin to keep CUDA wheels discoverable.
    # NOTE(review): the numpy range spans the 1.x -> 2.x boundary; confirm
    # that the oldest allowed torch build works with NumPy 2.
    "FlagEmbedding>=1.3.0,<2",
    "sentence-transformers>=3.0.0,<4",
    "torch>=2.2.0,<3",
    "numpy>=1.26.0,<3",
    "transformers>=4.42.0,<5",

    # Misc
    "httpx>=0.27.0,<0.29",
    "tenacity>=8.5.0,<10",
    "structlog>=24.2.0,<26",
    "orjson>=3.10.0,<4",
    # libmagic bindings: the environment markers select a different
    # distribution on Windows than on every other platform.
    "python-magic>=0.4.27; platform_system != 'Windows'",
    "python-magic-bin>=0.4.14; platform_system == 'Windows'",
    "langdetect>=1.0.9,<2",
    "regex>=2024.5.15",
    "rich>=13.7.1,<14",
    "tqdm>=4.66.4,<5",
    "click>=8.1.7,<9",
]

# Optional extras. `dev` collects test, lint and type-check tooling; these
# entries carry lower bounds only (no upper caps).
[project.optional-dependencies]
dev = [
    "pytest>=8.2.0",
    "pytest-asyncio>=0.23.7",
    "ruff>=0.5.0",
    "mypy>=1.10.0",
    # NOTE(review): `requests` is not listed among the runtime dependencies
    # (httpx is) -- confirm these stubs are still needed.
    "types-requests",
]

# Console entry points: each command name maps to the `main` callable of a
# module inside the `scripts` package.
[project.scripts]
legacyhub-ingest = "scripts.ingest_folder:main"
legacyhub-reindex = "scripts.reindex_document:main"
legacyhub-smoke = "scripts.smoke_test:main"

# Packages shipped in the wheel. `scripts` is included alongside `app`
# because the [project.scripts] entry points resolve to modules inside it.
[tool.hatch.build.targets.wheel]
packages = ["app", "scripts"]

# Ruff base settings. `target-version` matches the lower bound of
# `requires-python` (3.11). E501 is ignored under [tool.ruff.lint], so
# `line-length` guides formatting/wrapping rather than failing lint.
[tool.ruff]
line-length = 100
target-version = "py311"

# Enabled rule families: pycodestyle errors (E), Pyflakes (F), isort (I),
# flake8-bugbear (B), pyupgrade (UP), pep8-naming (N), Pylint (PL) and
# Ruff-specific rules (RUF).
# Ignored rules: E501 (line too long), PLR0913 (too many arguments) and
# PLR2004 (magic value used in comparison).
[tool.ruff.lint]
select = ["E", "F", "I", "B", "UP", "N", "PL", "RUF"]
ignore = ["E501", "PLR0913", "PLR2004"]

# Pytest collects from `tests/`. `asyncio_mode = "auto"` lets pytest-asyncio
# run async test functions without an explicit asyncio marker on each one.
[tool.pytest.ini_options]
testpaths = ["tests"]
asyncio_mode = "auto"