Skip to content

Commit aca64b0

Browse files
committed
build(scraper): migrate to Bun runtime and dynamic image versioning
Bun executes TypeScript natively, eliminating the ncc compilation step and the pnpm dependency. The Containerfile is now a lean two-stage build (deps + production) that runs src/main.ts directly via bun, and uses --filter + --production to avoid pulling in the ~100 AWS SDK packages that belong to packages/cli. The scraper image version was previously a static Terraform variable requiring a manual commit to bump. It is now built fresh in CI at the same git_ref as the website, so the scraper always runs at the exact commit being deployed. The scrape-and-index Argo task receives the image tag as a runtime parameter instead of having it baked into the Terraform config, removing the risk of version drift between site content and the scraper that indexes it.
1 parent d6b3b54 commit aca64b0

12 files changed

Lines changed: 170 additions & 122 deletions

File tree

.dockerignore

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@
1717
!packages/reference-typescript/src
1818
!packages/reference-typescript/package.json
1919
!packages/reference-typescript/tsconfig.json
20-
!packages/eslint/package.json
21-
!packages/eslint/index.js
2220
!packages/bastion
2321
!.npmrc
2422
!package.json

bun.lock

Lines changed: 58 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

environments/production/us-east-2/demo_cicd/terragrunt.hcl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ inputs = {
3838
mongodb_atlas_public_key = local.secrets.mongodb_atlas_public_key
3939
mongodb_atlas_private_key = local.secrets.mongodb_atlas_private_key
4040
site_url = "https://panfactum.com"
41-
scraper_image_version = "586d06cee96633a26ffe0ce318d85f26c3f7c27d"
4241
module_bucket = dependency.module_bucket.outputs.bucket_name
4342
installer_bucket = dependency.installer_bucket.outputs.installer_bucket_name
4443
}

infrastructure/demo_cicd/scraper_runner.tf

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,11 @@ module "run_scraper_workflow_spec" {
3333
{
3434
name = "algolia_index_name"
3535
description = "The index name in algolia to update"
36+
},
37+
38+
{
39+
name = "image_version"
40+
description = "The scraper image tag to run (typically the git commit hash built by scraper-builder)"
3641
}
3742
]
3843
}
@@ -53,19 +58,42 @@ module "run_scraper_workflow_spec" {
5358
templates = [
5459
{
5560
name = "scrape-and-index",
61+
inputs = {
62+
parameters = [
63+
{
64+
name = "image_version"
65+
}
66+
]
67+
}
5668
container = {
57-
image = "891377197483.dkr.ecr.us-east-2.amazonaws.com/scraper:${var.scraper_image_version}"
58-
command = ["node"]
59-
args = ["index.js", "{{workflow.parameters.sitemap_url}}", "{{workflow.parameters.algolia_index_name}}"]
69+
image = "891377197483.dkr.ecr.us-east-2.amazonaws.com/scraper:{{inputs.parameters.image_version}}"
70+
command = ["bun"]
71+
args = ["run", "src/main.ts", "{{workflow.parameters.sitemap_url}}", "{{workflow.parameters.algolia_index_name}}"]
6072
}
6173
},
6274
{
6375
name = "entry",
76+
inputs = {
77+
parameters = [
78+
{
79+
name = "image_version"
80+
default = "{{workflow.parameters.image_version}}"
81+
}
82+
]
83+
}
6484
dag = {
6585
tasks = [
6686
{
67-
name = "scrape-and-index"
68-
template : "scrape-and-index"
87+
name = "scrape-and-index"
88+
template = "scrape-and-index"
89+
arguments = {
90+
parameters = [
91+
{
92+
name = "image_version"
93+
value = "{{inputs.parameters.image_version}}"
94+
}
95+
]
96+
}
6997
}
7098
]
7199
}

infrastructure/demo_cicd/vars.tf

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,6 @@ variable "pull_through_cache_enabled" {
3636
default = false
3737
}
3838

39-
variable "scraper_image_version" {
40-
description = "The version of the image to use for the scraper"
41-
type = string
42-
}
43-
4439
variable "algolia_app_id" {
4540
description = "The Algolia App ID for the search index"
4641
type = string

infrastructure/demo_cicd/website_astro_builder.tf

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,28 @@ module "astro_builder_workflow" {
161161
template = "build-images"
162162
},
163163

164+
# Build the scraper image at the same commit as the website so the
165+
# scrape-and-index step always runs the latest code. The
166+
# scraper-builder's clone.sh resolves git_ref to a commit hash and
167+
# tags the ECR image with it, so git_ref must be a commit hash for
168+
# the downstream scrape-and-index task to find the matching tag.
169+
# The sensor trigger passes body.after (a commit hash) for this.
170+
{
171+
name = "build-scraper"
172+
templateRef = {
173+
name = module.scraper_builder.name
174+
template = module.scraper_builder.entrypoint
175+
}
176+
arguments = {
177+
parameters = [
178+
{
179+
name = "git_ref"
180+
value = "{{workflow.parameters.git_ref}}"
181+
}
182+
]
183+
}
184+
},
185+
164186
{
165187
name = "scrape-and-index"
166188
templateRef = {
@@ -176,10 +198,14 @@ module "astro_builder_workflow" {
176198
{
177199
name = "algolia_index_name"
178200
value = "{{workflow.parameters.algolia_index_name}}"
201+
},
202+
{
203+
name = "image_version"
204+
value = "{{workflow.parameters.git_ref}}"
179205
}
180206
]
181207
}
182-
depends = "build-images"
208+
depends = "build-images && build-scraper"
183209
}
184210
]
185211
}

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,5 @@
1212
"typescript": "5.6.2",
1313
"yaml": "2.7.1"
1414
},
15-
"workspaces": ["packages/cli"]
15+
"workspaces": ["packages/cli", "packages/scraper"]
1616
}

packages/scraper/Containerfile

Lines changed: 24 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,47 +1,43 @@
11
#################################################
22
## Base Image
33
#################################################
4-
FROM node:20-bullseye-slim AS base
4+
FROM oven/bun:1.2.5-slim AS base
55
WORKDIR /code
66

77
#################################################
88
## Dependencies Image
99
#################################################
1010
FROM base AS deps
1111

12-
# Install package manager
13-
RUN npm i -g pnpm@9.14.2
14-
15-
# Install Dependencies
16-
COPY package.json pnpm-lock.yaml .npmrc pnpm-workspace.yaml .
12+
# Workspace lockfile and members live at the repo root.
13+
# Copy the least-changing files first so layer caching survives source edits.
14+
# bun requires every workspace member's package.json to be present at install
15+
# time even when filtering, otherwise it errors with "Workspace not found".
16+
COPY package.json bun.lock ./
17+
COPY packages/cli/package.json packages/cli/
1718
COPY packages/scraper/package.json packages/scraper/
18-
COPY packages/eslint/ packages/eslint/
19-
RUN --mount=type=cache,id=pnpm,target=/code/.pnpm pnpm install --frozen-lockfile
20-
21-
#################################################
22-
## Development Image
23-
#################################################
24-
25-
FROM deps AS development
26-
WORKDIR /code/packages/scraper
27-
COPY packages/scraper/ .
28-
RUN npx ncc build src/main.ts -o dist
29-
30-
#################################################
31-
## Builder Image
32-
#################################################
3319

34-
FROM deps AS builder
35-
ENV NODE_ENV=production
36-
WORKDIR /code/packages/scraper
37-
COPY packages/scraper/ .
38-
RUN npx ncc build src/main.ts -o dist
20+
# --filter scrape installs only the scraper workspace's deps (skips the
21+
# 100+ AWS SDK packages that belong to packages/cli). --production drops
22+
# devDependencies (typescript, @types/node) since bun runs TypeScript natively.
23+
# The cache mount preserves bun's tarball cache across builds via the BuildKit
24+
# S3 cache backend, so reinstalls are essentially free.
25+
RUN --mount=type=cache,id=bun,target=/root/.bun/install/cache \
26+
bun install --frozen-lockfile --filter scrape --production
3927

4028
#################################################
4129
## Production Image
4230
#################################################
43-
4431
FROM base AS production
4532
ENV NODE_ENV=production
33+
34+
# COPY --link makes each layer content-addressable so they can be cached
35+
# independently of preceding layers. Layers are listed most-stable first
36+
# so source edits only invalidate the final layer.
37+
COPY --link --from=deps /code/node_modules ./node_modules
38+
COPY --link package.json ./
39+
COPY --link packages/scraper/package.json packages/scraper/tsconfig.json packages/scraper/
40+
COPY --link packages/scraper/src packages/scraper/src
41+
4642
WORKDIR /code/packages/scraper
47-
COPY --from=builder /code/packages/scraper/dist ./
43+
CMD ["bun", "run", "src/main.ts"]
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Per-Dockerfile dockerignore for the scraper image build.
2+
#
3+
# BuildKit loads this file from the --local dockerfile (packages/scraper/)
4+
# and applies it to the --local context (repo root). It takes precedence
5+
# over the root .dockerignore for this Containerfile only.
6+
#
7+
# We use an allowlist (deny-everything-then-allow) to keep the build
8+
# context as small as possible. Without this, the context includes the
9+
# 77MB packages/website/ tree which the scraper does not need.
10+
11+
*
12+
13+
# Root files needed for the bun workspace install
14+
!bun.lock
15+
!package.json
16+
17+
# All workspace members listed in root package.json must have a
18+
# package.json present at install time, even when filtering, otherwise
19+
# bun errors with "Workspace not found".
20+
!packages/cli/package.json
21+
22+
# The actual scraper sources
23+
!packages/scraper/package.json
24+
!packages/scraper/tsconfig.json
25+
!packages/scraper/src

packages/scraper/package.json

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,15 @@
33
"type": "module",
44
"version": "1.0.0",
55
"description": "",
6-
"main": "index.js",
6+
"main": "src/main.ts",
77
"keywords": [],
88
"author": "",
99
"license": "AGPL",
1010
"scripts": {
11-
"dev": "tsx src/main.ts"
11+
"dev": "bun run src/main.ts"
1212
},
1313
"devDependencies": {
1414
"@types/node": "18.13.0",
15-
"@vercel/ncc": "0.38.2",
16-
"tsx": "4.19.1",
1715
"typescript": "5.6.2"
1816
},
1917
"dependencies": {

0 commit comments

Comments (0)