Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,22 @@
RegexOption.MULTILINE
)

// Matches one CommonMark fenced code block so it can be stripped from the
// instruction text before file references are extracted.
//
// A match consists of:
//   1. An opening fence at the start of a line (up to three leading
//      spaces/tabs of indent) made of three or more backticks or three or
//      more tildes, followed by the rest of that line (the info string,
//      e.g. a language tag).
//   2. Everything up to and including the first LATER line that is a
//      closing fence: same fence character, at least as long as the
//      opening fence, optionally followed by trailing spaces/tabs. The
//      line terminator after the closing fence is consumed too.
//   3. If no closing fence exists, everything through end of input
//      (CommonMark implicitly closes an unclosed fence at EOF).
//
// Capture groups: 1 = the full opening fence, 2 = the fence character.
//
// The closing-fence alternative starts with `\n`, and the opening line's
// own terminator is deliberately NOT consumed up front. That lets an empty
// block (closing fence directly under the opening fence) close correctly
// instead of swallowing all following prose.
//
// Why this exists: skill bodies routinely embed code samples that
// contain `[label](path)`-shaped strings (JS template literals,
// markdown rendered as a string in code, etc.) and resource-dir-
// prefixed strings ("scripts/legacy.py"). Those are illustrations,
// not real file references, and validating them as files turns any
// skill teaching code into a footgun.
private val FENCED_CODE_BLOCK = Regex(
    """(?ms)^[ \t]{0,3}(([`~])\2{2,}).*?(?:\n[ \t]{0,3}\1\2*[ \t]*(?:\n|$)|\z)"""
)

/**
* Extract all file references from instruction text.
*
Expand All @@ -50,18 +66,22 @@
return emptySet()
}

// Strip fenced code blocks BEFORE running the extractors so that
// illustrative code samples don't pollute the reference set.
val withoutCode = FENCED_CODE_BLOCK.replace(instructions, "")

val references = mutableSetOf<String>()

// Extract markdown link targets that are local paths
MARKDOWN_LINK_PATTERN.findAll(instructions).forEach { match ->
MARKDOWN_LINK_PATTERN.findAll(withoutCode).forEach { match ->
val path = match.groupValues[2]
if (isLocalPath(path)) {
references.add(normalizePath(path))
}
}

// Extract inline resource paths
RESOURCE_PATH_PATTERN.findAll(instructions).forEach { match ->
RESOURCE_PATH_PATTERN.findAll(withoutCode).forEach { match ->
val path = match.groupValues[1]
references.add(normalizePath(path))
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -157,4 +157,140 @@ class InstructionFileReferenceExtractorTest {

assertEquals(setOf("scripts/build.sh"), result)
}

// ─── Fenced code blocks must not contribute file references ──────────
//
// Skill bodies routinely include code examples in ``` fences. Anything
// inside is a sample, not a reference — it must NOT be validated as a
// local file. Otherwise any skill teaching code (most of them) is
// forced to avoid `[label](path)`-shaped lines and `scripts/foo.x`
// strings inside its examples, which is a footgun.

@Test
fun `ignores markdown link inside fenced code block`() {
    // One genuine markdown link in prose, plus a JS template literal inside
    // a fence that merely looks like a markdown link.
    val skillBody = """
        Real reference: [the guide](references/guide.md).

        Example code:

        ```javascript
        const r = await fetch(url);
        console.log(`- [${'$'}{hit.title}](${'$'}{hit.url})`);
        ```
    """.trimIndent()
    val expected = setOf("references/guide.md")

    val extracted = InstructionFileReferenceExtractor.extract(skillBody)

    // Only the prose link may be reported; the fenced sample is ignored.
    assertEquals(expected, extracted)
}

@Test
fun `ignores resource path inside fenced code block`() {
    // The prose names a real script; the fenced Python sample names another
    // path that is purely illustrative.
    val skillBody = """
        Run scripts/build.sh to compile.

        ```python
        # Don't do this — it's just an illustration
        subprocess.run(["python", "scripts/legacy.py"])
        ```
    """.trimIndent()
    val expected = setOf("scripts/build.sh")

    val extracted = InstructionFileReferenceExtractor.extract(skillBody)

    // Only the path mentioned in prose counts as a reference.
    assertEquals(expected, extracted)
}

@Test
fun `ignores tilde-fenced code block`() {
    // Same expectation as for backtick fences, but the block is delimited
    // with ~~~ instead.
    val skillBody = """
        See [docs](references/docs.md).

        ~~~
        scripts/oops.sh
        [link](references/oops.md)
        ~~~
    """.trimIndent()
    val expected = setOf("references/docs.md")

    val extracted = InstructionFileReferenceExtractor.extract(skillBody)

    assertEquals(expected, extracted)
}

@Test
fun `ignores fence with language tag`() {
    // The whole input is a single fenced block whose opening fence carries
    // an info string; nothing inside it may be reported.
    val skillBody = """
        ```kotlin
        // [Foo](references/foo.kt)
        val x = "scripts/x.kt"
        ```
    """.trimIndent()

    val extracted = InstructionFileReferenceExtractor.extract(skillBody)

    assertTrue(extracted.isEmpty())
}

@Test
fun `handles unclosed fenced code block as code through end of input`() {
    // CommonMark treats a fence that is never closed as running to the end
    // of the document. The extractor is expected to do the same, so a
    // malformed skill body does not have its code reinterpreted as prose.
    val skillBody = """
        Intro paragraph mentions [real](references/real.md).

        ```
        scripts/never-real.py
        [also](references/never-real.md)
    """.trimIndent()
    val expected = setOf("references/real.md")

    val extracted = InstructionFileReferenceExtractor.extract(skillBody)

    assertEquals(expected, extracted)
}

@Test
fun `handles multiple fenced blocks interleaved with prose`() {
    // Three prose references separated by two fenced blocks; each fence
    // contains a path that must be skipped.
    val skillBody = """
        First, see [setup](references/setup.md).

        ```
        scripts/in-fence-1.sh
        ```

        Then run scripts/build.sh.

        ```bash
        scripts/in-fence-2.sh
        ```

        Finally consult assets/diagram.png.
    """.trimIndent()
    val expected = setOf(
        "references/setup.md",
        "scripts/build.sh",
        "assets/diagram.png",
    )

    val extracted = InstructionFileReferenceExtractor.extract(skillBody)

    assertEquals(expected, extracted)
}

@Test
fun `extracts references on the same line as a closing fence terminator`() {
    // Defensive check: prose on the line directly after the closing fence
    // must still be scanned, i.e. stripping the fenced block must not also
    // strip the content that follows it.
    val skillBody = """
        ```
        scripts/in-fence.sh
        ```
        See [the docs](references/docs.md) for details.
    """.trimIndent()
    val expected = setOf("references/docs.md")

    val extracted = InstructionFileReferenceExtractor.extract(skillBody)

    assertEquals(expected, extracted)
}
}
Loading