diff --git a/packages/cli/src/camkit.ts b/packages/cli/src/camkit.ts index d069bdf..a6c3b60 100755 --- a/packages/cli/src/camkit.ts +++ b/packages/cli/src/camkit.ts @@ -3,7 +3,7 @@ * camkit — Camtasia project CLI over @camkit/core + @camkit/darwin. * Port of edit-videos/cam.ts with identical command behavior and output. * - * camkit info|clips|sources|rebuild|silences|transcribe|status|close|open|docs + * camkit info|clips|sources|rebuild|silences|transcribe|status|close|open|docs|takes|words * * --project accepts a .cmproj dir or project.tscproj path; defaults to * ./search.cmproj/project.tscproj. Read commands never mutate; rebuild backs @@ -28,9 +28,13 @@ import { projectInfo, bundleName, resolveProjectPath, + secondsToUnits, + segmentTakes, + tracks, + wordsInRange, type KeepSeg, } from "@camkit/core"; -import { camtasiaDocPaths, camtasiaDocs, closeProject, openProject, projectStatus } from "@camkit/darwin"; +import { camtasiaDocPaths, closeProject, openProject, projectStatus } from "@camkit/darwin"; import { exportAudio, runSilencedetect, transcribeRecording } from "./media.ts"; import { listPresets, resolvePreset } from "./presets.ts"; import { version } from "../package.json"; @@ -198,7 +202,26 @@ const HELP: Record = { }, docs: { usage: "camkit docs", - about: ["List all projects currently open in Camtasia. macOS only."], + about: ["List all projects currently open in Camtasia with their full", "paths. macOS only."], + }, + takes: { + usage: "camkit takes [gap]", + about: [ + "Segment a transcript's word list into takes by splitting on inter-word", + "gaps larger than `gap` seconds (default 1.2). Prints one line per take:", + ' [start-end] (dur Nw) text', + "Degenerate Whisper padding words (zero-length stamps at clip ends) are", + "stripped before boundaries are computed, so durations match audible", + "speech. 
/** Word-list type accepted by `segmentTakes`; derived so no extra type import is needed. */
type TranscriptWords = Parameters<typeof segmentTakes>[0];

/** Drop `--flag` style arguments, leaving only positional ones (negative numbers like `-1` are kept). */
function positionalArgs(argv: string[]): string[] {
  return argv.filter((a) => !a.startsWith("--"));
}

/**
 * Parse a CLI argument as a finite number of seconds.
 * `Number("")` is 0 and `Number("Infinity")` is Infinity, so a plain NaN check
 * is not enough; blank and non-finite values are rejected with `message`.
 */
function parseSeconds(raw: string, message: string): number {
  const value = raw.trim() === "" ? Number.NaN : Number(raw);
  if (!Number.isFinite(value)) throw new Error(message);
  return value;
}

/**
 * Read a transcript JSON file and return its word-level `words` array.
 * @throws Error naming the file when it cannot be read or parsed, or when the
 *   parsed value has no `words` array (including a top-level `null`).
 */
function readTranscriptWords(file: string): TranscriptWords {
  let parsed: unknown;
  try {
    parsed = JSON.parse(readFileSync(file, "utf8"));
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`Cannot read transcript ${file}: ${reason}`);
  }
  const words =
    typeof parsed === "object" && parsed !== null ? (parsed as { words?: unknown }).words : undefined;
  if (!Array.isArray(words)) {
    throw new Error(`${file} has no word-level "words" array (transcribe with whisper-1).`);
  }
  return words as TranscriptWords;
}

/** `camkit docs` — print one `name<TAB>path` line per project open in Camtasia. */
function cmdDocs() {
  const docs = camtasiaDocPaths();
  if (!docs.length) {
    console.log("Camtasia is not running, or has no projects open.");
    return;
  }
  for (const d of docs) console.log(`${d.name}\t${d.path}`);
}

/**
 * `camkit takes <transcript.json> [gap]` — segment a transcript's words into
 * takes and print one `[start-end] (dur Nw) text` line per take.
 * @param argv Command arguments; `--` flags are ignored.
 * @throws Error on missing arguments, a missing/invalid transcript file, or a
 *   gap that is not a finite, non-negative number.
 */
function cmdTakes(argv: string[]) {
  const positional = positionalArgs(argv);
  if (positional.length < 1) throw new Error("Usage: camkit takes <transcript.json> [gap]");
  const file = resolve(positional[0]);
  if (!existsSync(file)) throw new Error(`No such file: ${file}`);

  const gapError = "gap must be a non-negative number of seconds";
  const gap = positional[1] != null ? parseSeconds(positional[1], gapError) : 1.2;
  if (gap < 0) throw new Error(gapError);

  // Load and validate before touching segmentTakes so bad input fails with a clear message.
  const words = readTranscriptWords(file);
  for (const t of segmentTakes(words, gap)) {
    const start = t.start.toFixed(2).padStart(7);
    const end = t.end.toFixed(2).padStart(7);
    const dur = (t.end - t.start).toFixed(1).padStart(5);
    const count = String(t.words.length).padStart(3);
    console.log(`[${start}-${end}] (${dur}s ${count}w) ${t.text}`);
  }
}

/**
 * `camkit words <transcript.json> <start> <end>` — print `idx start-end word`
 * for every word inside the inclusive [start, end] window.
 * @param argv Command arguments; `--` flags are ignored.
 * @throws Error on missing arguments, a missing/invalid transcript file,
 *   non-finite bounds, or `start` greater than `end`.
 */
function cmdWords(argv: string[]) {
  const positional = positionalArgs(argv);
  if (positional.length < 3) {
    throw new Error("Usage: camkit words <transcript.json> <start> <end>");
  }
  const file = resolve(positional[0]);
  if (!existsSync(file)) throw new Error(`No such file: ${file}`);

  const boundsError = "start and end must be numbers (seconds).";
  const start = parseSeconds(positional[1], boundsError);
  const end = parseSeconds(positional[2], boundsError);
  // A reversed window would otherwise print nothing, which looks like "no words here".
  if (start > end) throw new Error("start must not be greater than end.");

  const words = readTranscriptWords(file);
  for (const w of wordsInRange(words, start, end)) {
    const idx = String(w.idx).padStart(4);
    const from = w.start.toFixed(2).padStart(7);
    const to = w.end.toFixed(2).padStart(7);
    console.log(`${idx} ${from}-${to} ${w.word}`);
  }
}
/**
 * Minimum word duration (seconds) to count as real speech. Whisper pads clip
 * ends with degenerate zero-length words at a frozen timestamp; these must be
 * ignored when computing takes or the reported start, end, and word count are
 * all wrong.
 */
export const DEGENERATE_THRESHOLD = 0.05;

/**
 * Whether a word is degenerate, i.e. its duration is below `threshold`.
 * Whisper emits such words as padding at clip ends — a cluster of words all
 * sharing the same frozen timestamp (e.g. 20 words at 223.78).
 *
 * @param w Word to test.
 * @param threshold Minimum duration in seconds; defaults to DEGENERATE_THRESHOLD.
 * @returns true when `w.end - w.start` is strictly less than `threshold`.
 */
export function isDegenerate(w: TranscriptWord, threshold = DEGENERATE_THRESHOLD): boolean {
  return w.end - w.start < threshold;
}

/** One contiguous run of audible words, as produced by `segmentTakes`. */
export interface Take {
  /** Start time of the first kept word. */
  start: number;
  /** End time of the last kept word. */
  end: number;
  /** The kept (non-degenerate) words, in input order. */
  words: TranscriptWord[];
  /** The kept words joined with single spaces. */
  text: string;
}

/**
 * Segment a word list into takes by splitting on inter-word gaps larger than
 * `gap` seconds (default 1.2).
 *
 * Degenerate words (see `isDegenerate`) are skipped *before* gaps are
 * measured, so a frozen-timestamp padding word sitting inside a pause cannot
 * bridge two takes that are really separated by more than `gap` of silence.
 * As a result no returned take contains an internal gap larger than `gap`,
 * and start/end/word count reflect audible speech only. Runs that consist
 * solely of degenerate words produce no take.
 *
 * @param words Words in chronological order; the array is not mutated.
 * @param gap Split threshold in seconds.
 * @returns Takes in input order; empty when there are no audible words.
 */
export function segmentTakes(words: readonly TranscriptWord[], gap = 1.2): Take[] {
  const takes: Take[] = [];
  let current: TranscriptWord[] = [];

  // Close the take being built (if any) and start a fresh one.
  const flush = (): void => {
    if (current.length === 0) return;
    takes.push({
      start: current[0].start,
      end: current[current.length - 1].end,
      words: current,
      text: current.map((w) => w.word).join(" "),
    });
    current = [];
  };

  for (const w of words) {
    if (isDegenerate(w)) continue;
    const prev = current[current.length - 1];
    // Gap is measured between consecutive audible words only.
    if (prev !== undefined && w.start - prev.end > gap) flush();
    current.push(w);
  }
  flush();

  return takes;
}
/**
 * Select the words that lie entirely inside the closed window [start, end]:
 * a word is kept when its own start is at or after `start` and its own end is
 * at or before `end`.
 *
 * Every hit carries `idx`, the word's position in the input array, so callers
 * can point at exact positions when choosing cut points.
 *
 * @param words Source word list; not mutated.
 * @param start Window start in seconds (inclusive).
 * @param end Window end in seconds (inclusive).
 * @returns Matching words in input order, each with its original index.
 */
export function wordsInRange(
  words: TranscriptWord[],
  start: number,
  end: number,
): { idx: number; word: string; start: number; end: number }[] {
  const hits: { idx: number; word: string; start: number; end: number }[] = [];
  words.forEach((entry, idx) => {
    const inside = entry.start >= start && entry.end <= end;
    if (inside) {
      hits.push({ idx, word: entry.word, start: entry.start, end: entry.end });
    }
  });
  return hits;
}
+ const padding: TranscriptWord[] = Array.from({ length: 20 }, () => w("pad", 10.0, 10.0)); + const words = [ + w("real", 0, 0.5), + w("speech", 0.6, 1.0), + w("ends", 1.1, 1.4), + // 3s gap then degenerate cluster + ...padding, + ]; + const takes = segmentTakes(words, 1.2); + // The degenerate cluster forms its own "take" but is entirely stripped, + // leaving only the real speech take. + expect(takes).toHaveLength(1); + expect(takes[0].start).toBe(0); + expect(takes[0].end).toBe(1.4); + expect(takes[0].text).toBe("real speech ends"); + expect(takes[0].words).toHaveLength(3); +}); + +test("segmentTakes strips degenerate words mixed into a take tail", () => { + // Degenerate words at the end of a take (no gap separating them). + const words = [ + w("audible", 5.0, 5.5), + w("words", 5.6, 6.0), + w("frozen1", 6.0, 6.0), + w("frozen2", 6.0, 6.0), + ]; + const takes = segmentTakes(words, 1.2); + expect(takes).toHaveLength(1); + expect(takes[0].end).toBe(6.0); + expect(takes[0].words).toHaveLength(2); + expect(takes[0].text).toBe("audible words"); +}); + +test("segmentTakes drops takes that are entirely degenerate", () => { + const words = [ + w("real", 0, 0.5), + w("real2", 0.6, 1.0), + // 5s gap then a pure degenerate cluster + w("d1", 6.0, 6.0), + w("d2", 6.0, 6.0), + ]; + const takes = segmentTakes(words, 1.2); + expect(takes).toHaveLength(1); + expect(takes[0].text).toBe("real real2"); +}); + +test("segmentTakes handles empty input", () => { + expect(segmentTakes([])).toEqual([]); +}); + +test("isDegenerate detects zero-length and near-zero words", () => { + expect(isDegenerate(w("x", 10, 10))).toBe(true); + expect(isDegenerate(w("x", 10, 10.01))).toBe(true); + expect(isDegenerate(w("x", 10, 10.06))).toBe(false); + expect(isDegenerate(w("x", 10, 10.5))).toBe(false); +}); + +test("wordsInRange filters to the inclusive window and preserves indices", () => { + const words = [ + w("zero", 0, 0.5), + w("one", 0.6, 1.0), + w("two", 1.1, 1.5), + w("three", 1.6, 2.0), + 
w("four", 2.1, 2.5), + ]; + const result = wordsInRange(words, 0.6, 2.0); + expect(result).toEqual([ + { idx: 1, word: "one", start: 0.6, end: 1.0 }, + { idx: 2, word: "two", start: 1.1, end: 1.5 }, + { idx: 3, word: "three", start: 1.6, end: 2.0 }, + ]); +}); diff --git a/skills/SKILLS.md b/skills/SKILLS.md new file mode 100644 index 0000000..990d406 --- /dev/null +++ b/skills/SKILLS.md @@ -0,0 +1,14 @@ +# Skills + +Repo-distributed skills for camkit. Each lives in its own directory as `/SKILL.md`. Add a one-line entry here when you add a skill. + +Claude Code only auto-discovers skills under `.claude/skills/`, so to use these, symlink them in once per checkout: + +```sh +mkdir -p .claude/skills +ln -s ../../skills/rough-cut .claude/skills/rough-cut +``` + +| Skill | What it does | +|-------|--------------| +| [rough-cut](rough-cut/SKILL.md) | Transcribe the on-timeline recordings of the open Camtasia project with Whisper, then cut silences, filler, false starts, and losing retakes into a tight rough cut. Optionally aligned to a script. | diff --git a/skills/rough-cut/SKILL.md b/skills/rough-cut/SKILL.md new file mode 100644 index 0000000..ded1692 --- /dev/null +++ b/skills/rough-cut/SKILL.md @@ -0,0 +1,116 @@ +--- +name: rough-cut +description: Rough-cut the Camtasia project currently open in Camtasia — transcribe the on-timeline recordings with Whisper, then cut silences, filler, false starts, and bad retakes so the talk flows. Use when the user says "rough cut", "tidy the recording", "cut the silences/filler", "clean up the timeline", or similar. Optionally aligned to a script the user supplies; works without one too. +--- + +# rough-cut + +Turn raw long-take recordings on a Camtasia timeline into a tight rough cut, using `camkit`. The user records long, this cuts the dead air, filler, false starts, and losing takes so the result flows. 
+ +## Hard rules + +- **Only edit what is on the timeline.** Run `camkit clips` / `camkit sources` and operate on `on timeline` sources only. Never touch `bin only` sources. +- **Silences are the #1 failure.** Whisper folds pauses into stretched word timestamps, so word gaps alone hide dead air. You have missed silences before. For every source you MUST run `camkit silences` and use ITS timestamps to find/cut dead air. Do not trust the transcript's word times to find pauses. +- **Always `--dry-run` first.** `camkit rebuild` is destructive; review the plan before writing. +- **Script is optional.** If the user gives a script path, align to it (pick the take matching each line, keep script order, drop retakes). If not, just cut silences + filler + false starts and keep the natural order. + +## Workflow + +### 1. Find the open project +```sh +camkit status # confirms Camtasia is running + which doc is open +camkit docs # open .cmproj names + full paths +``` +`camkit docs` prints `<name>\t<path>` per open project. Capture the path for the project you want to cut: +```sh +P=$(camkit docs | grep '<name>' | cut -f2) +``` +Use it as `--project` for every command, or rely on the read-command fallback to the open project. Keep the path in a shell var. + +### 2. Inspect the timeline +```sh +camkit clips --project "$P" # what's laid down, in order, with src= ids + .trec paths +camkit sources --project "$P" # which sources are on timeline vs bin only +camkit info --project "$P" +``` +Note each on-timeline `src=N` and its `.trec` path. Two-track screen+camera sources show on both tracks — that's fine, `rebuild` clones every track a source touches, so sync is preserved. Reference the source ONCE in the keep list. + +### 3. 
Transcribe + detect silences for each on-timeline source +Create a scratch dir in the project (survives reboots, scoped to this project): +```sh +RC="$P/.camkit/rc" +mkdir -p "$RC" +``` +For every on-timeline source (run these in parallel — they're independent): +```sh +camkit transcribe "<trec-path>" --out "$RC/srcN.json" # word-level Whisper (OpenAI whisper-1) +camkit silences "<trec-path>" --db -35 --min 0.4 # dead-air ranges from ffmpeg +``` +Loop over them in one backgrounded batch and `wait`; ~45 min across 8 sources finishes in a couple of minutes. +- `--db` / `--min` tune sensitivity. Start `-35 dB`, `0.4 s`. Adjust if needed (quieter mic → `-30`; only long pauses → `--min 0.8`). +- The transcript JSON is `{text, words:[{word,start,end}], segments}`. Use word times for content boundaries; use `silences` for pauses. +- **`silences` output format** is `silence START-ENDs (DURs)` per line (e.g. `silence 12.30-15.40s (3.10s)`). Parse the two float timestamps, not raw `silence_start:` lines. + +#### The dead-air-inside-a-take trap (the silences you've missed) +Whisper does NOT emit a gap for a pause mid-sentence — it **stretches one word** to span it. A line in the words dump like `233.00-239.74 of` (a 6.7 s "of") is 6 s of silence hiding inside a kept take. Two defences, use both: +1. For every kept range, scan the word list inside it for any single word whose `end-start` is large (> ~1 s). Split the range around it. +2. Cross-check each kept range against the `silences` list; if a silence sits inside it, split it out. + +### 4. Read the script (if provided) + find the keeper takes + +These recordings are **heavy retake material**: the presenter says each beat many times, restarting, until the last pass is clean. The keeper for a beat is almost always the **final complete clean delivery**; everything before it is false starts to cut. + +Reading 3000+ raw words per source into context is wasteful. 
Two `camkit` subcommands make it tractable: +- **`camkit takes <transcript.json> [gap]`** — segments a source's words into takes by splitting on word-gaps > `gap` (default 1.2 s) and prints `[start-end] (dur Nw) text` per take. Degenerate Whisper padding words are stripped automatically. Collapses the chaos to a readable list; the keeper is usually the last full take of each beat. +- **`camkit words <transcript.json> <start> <end>`** — prints `idx start-end word` for words in `[start,end]`. Use it to set precise cut points inside a take (isolating a clean tail from leading stammers, or splitting out stretched-word dead air). + +```sh +camkit takes "$RC/src5.json" # scan the takes for src 5 +camkit words "$RC/src5.json" 120.0 140.0 # drill into a specific range +``` + +Map script lines to takes, pick the final clean delivery of each, drop retakes. Honor the script's order — `rebuild` lays kept ranges in the order you list them. (When no script: keep the natural take order, still picking the clean final pass of each beat.) + +Whisper pads clip ends with degenerate zero-length words at one frozen timestamp (e.g. 20 words all at `223.78`). `camkit takes` already strips these — but when building keep ranges by hand, end the last range before they start. + +### 5. Build the keep list +Write the plan as a JSON file for `--from` (cleaner than a long `--keep` string): `{"keep":[{"src":N,"start":S,"end":E}, ...]}` in final playback order. For each kept span: +- **Trim dead air** at the head/tail using the `silences` ranges, not word times. +- **Drop long mid-span pauses** by splitting one span into two around the silence (`N:a-b N:c-d`). +- **Cut filler** ("um", "uh", "so", "you know", false starts, restarts, "let me redo that"). +- **Cut losing retakes** entirely. +- Leave ~0.15-0.25 s of breath at cut points so it doesn't sound clipped. +- Cross-check: for every kept range, confirm no `silences` entry and no stretched word sits inside it un-cut. If one does, split it out. 
This is the step that catches the silences you've missed before. + +```sh +camkit rebuild --project "$P" --from "$RC/keep.json" --dry-run +``` + +### 6. Dry-run, review, apply +```sh +camkit rebuild --project "$P" --from "$RC/keep.json" --dry-run # read the plan: segment count, total duration +``` +Writing needs Camtasia to release the project. The close→rebuild handoff has a lock quirk — handle it: +```sh +camkit close --project "$P" # save-and-close in Camtasia +camkit docs # confirm: no open documents +``` +Camtasia releases the document but often leaves a **stale `~project.tscproj` lock file** behind, so an immediate `rebuild` fails with "Lock file present". Once `camkit docs` shows nothing open, that lock is stale and `--force` is correct: +```sh +camkit rebuild --project "$P" --from "$RC/keep.json" --force # backs up to project.tscproj.bak, then writes +camkit open --project "$P" # reopen for the user to review +camkit info --project "$P" | grep duration # sanity-check the new duration +``` +Only `--force` past the lock once `camkit docs` confirms the project is closed — never while it's genuinely open in Camtasia. **Never script or automate `--force`** — it must only follow a human-readable `camkit docs` showing no open documents. + +## Recutting +`rebuild` backs up to `project.tscproj.bak`. To cut again from the ORIGINAL (not the already-cut file), restore first or you'll cut the cut: +```sh +cp "$P/project.tscproj.bak" "$P/project.tscproj" +``` +Then redo from step 5. Transcripts/silences from step 3 are still valid (sources are untouched), so no need to re-transcribe. + +## Notes +- `transcribe` needs `ffmpeg` + an engine: `OPENAI_API_KEY` (whisper-1, best) or local `whisper-cpp`. It never mutates source media. +- Long recordings: transcription of ~45 min across several sources takes minutes and costs OpenAI credits. Run sources concurrently. +- Sanity-check the dry-run duration against your expectation (a tight cut of a 45 min take is usually 12–25 min). 
A surprising number means the keep list is wrong.