This commit introduces a new feature to extract and index metadata from collected PDF files.

The following changes have been made:

- Added a new `pdf` command with a `metadata` subcommand to extract metadata from a single PDF file.
- Added a new `extract-metadata` command to extract metadata from all PDF files within a given archive and create an `INDEX.json` file.
- Added a `--extract-pdf-metadata` flag to the `collect website` command to extract metadata from downloaded PDF files.
- Created a new `pdf` package to encapsulate the PDF metadata extraction logic, which uses the `pdfinfo` command from the `poppler-utils` package.
- Added unit tests for the new `pdf` package, including mocking the `pdfinfo` command.
- Modified `Taskfile.yml` to install `poppler-utils` as a dependency.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
55 lines
1.1 KiB
YAML
55 lines
1.1 KiB
YAML
---
# Taskfile for the borg CLI and the STMF WebAssembly module.
# Run `task --list` to see the available tasks.
version: '3'

tasks:
  install-deps:
    desc: Install system packages needed at runtime (poppler-utils for pdfinfo)
    cmds:
      - sudo apt-get update && sudo apt-get install -y poppler-utils
    # Skip the apt-get/sudo round trip when pdfinfo is already on PATH;
    # without this check every build re-runs the package installation.
    status:
      - command -v pdfinfo

  clean:
    desc: Remove the borg binary
    cmds:
      - rm -f borg

  build:
    desc: Build the borg binary
    deps:
      - install-deps
    cmds:
      - task: clean
      - go build -o borg main.go
    # Fingerprinted inputs: the task is skipped while these are unchanged
    # and the generated binary still exists.
    sources:
      - main.go
      - ./pkg/**/*.go
      # Dependency changes also alter the resulting binary.
      - go.mod
      - go.sum
    generates:
      - borg

  run:
    desc: Build and run borg (pass arguments after `--`)
    # `build` runs once as a dependency; it is not repeated in cmds.
    deps:
      - build
    cmds:
      - chmod +x borg
      # CLI_ARGS is empty unless arguments are given after `--`.
      - ./borg {{.CLI_ARGS}}

  test:
    desc: Run the Go tests and write a coverage profile to coverage.txt
    cmds:
      - go test -coverprofile=coverage.txt ./...

  test-e2e:
    desc: Smoke-test the built binary by invoking its help output
    deps:
      - build
    cmds:
      - chmod +x borg
      - ./borg --help

  wasm:
    desc: Build STMF WASM module for browser
    cmds:
      - mkdir -p dist
      - GOOS=js GOARCH=wasm go build -o dist/stmf.wasm ./pkg/wasm/stmf/
      # wasm_exec.js lives in lib/wasm since Go 1.24 and in misc/wasm in
      # earlier releases; copy whichever one this toolchain ships.
      - |
        goroot="$(go env GOROOT)"
        if [ -f "$goroot/lib/wasm/wasm_exec.js" ]; then
          cp "$goroot/lib/wasm/wasm_exec.js" dist/
        else
          cp "$goroot/misc/wasm/wasm_exec.js" dist/
        fi
    sources:
      - ./pkg/stmf/**/*.go
      - ./pkg/wasm/stmf/*.go
    generates:
      - dist/stmf.wasm
      - dist/wasm_exec.js

  wasm-js:
    desc: Build STMF WASM and JS wrapper
    # `wasm` runs once as a dependency; it is not repeated in cmds.
    deps:
      - wasm
    cmds:
      # Make sure the copy destination exists before copying into it.
      - mkdir -p js/borg-stmf/dist
      - cp dist/stmf.wasm js/borg-stmf/dist/
      - cp dist/wasm_exec.js js/borg-stmf/dist/