Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 73 additions & 6 deletions packages/cli/src/camkit.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* camkit — Camtasia project CLI over @camkit/core + @camkit/darwin.
* Port of edit-videos/cam.ts with identical command behavior and output.
*
* camkit info|clips|sources|rebuild|silences|transcribe|status|close|open|docs
* camkit info|clips|sources|rebuild|silences|transcribe|status|close|open|docs|takes|words
*
* --project accepts a .cmproj dir or project.tscproj path; defaults to
* ./search.cmproj/project.tscproj. Read commands never mutate; rebuild backs
Expand All @@ -28,9 +28,13 @@ import {
projectInfo,
bundleName,
resolveProjectPath,
secondsToUnits,
segmentTakes,
tracks,
wordsInRange,
type KeepSeg,
} from "@camkit/core";
import { camtasiaDocPaths, camtasiaDocs, closeProject, openProject, projectStatus } from "@camkit/darwin";
import { camtasiaDocPaths, closeProject, openProject, projectStatus } from "@camkit/darwin";
import { exportAudio, runSilencedetect, transcribeRecording } from "./media.ts";
import { listPresets, resolvePreset } from "./presets.ts";
import { version } from "../package.json";
Expand Down Expand Up @@ -198,7 +202,26 @@ const HELP: Record<string, { usage: string; about: string[] }> = {
},
docs: {
usage: "camkit docs",
about: ["List all projects currently open in Camtasia. macOS only."],
about: ["List all projects currently open in Camtasia with their full", "paths. macOS only."],
},
takes: {
usage: "camkit takes <transcript.json> [gap]",
about: [
"Segment a transcript's word list into takes by splitting on inter-word",
"gaps larger than `gap` seconds (default 1.2). Prints one line per take:",
' [start-end] (dur Nw) text',
"Degenerate Whisper padding words (zero-length stamps at clip ends) are",
"stripped before boundaries are computed, so durations match audible",
"speech. Use this to find the clean final take of each beat.",
],
},
words: {
usage: "camkit words <transcript.json> <start> <end>",
about: [
"Print every word (with its index + timestamps) inside the inclusive",
"[start, end] window. Use it to set precise cut points inside a take,",
"isolate a clean tail, or inspect a stretched-word dead-air artifact.",
],
},
};

Expand All @@ -225,7 +248,9 @@ function printHelp(cmd?: string): void {
status: "is this project open in Camtasia? (exit 2 if so)",
close: "save-and-close the project document in Camtasia",
open: "(re)open the project in Camtasia",
docs: "list projects open in Camtasia",
docs: "list projects open in Camtasia (with paths)",
takes: "segment a transcript into takes by word gaps",
words: "print words in a time range from a transcript",
};
for (const [c, s] of Object.entries(summaries)) console.log(` ${c.padEnd(11)} ${s}`);
console.log();
Expand Down Expand Up @@ -480,12 +505,52 @@ function cmdOpen(argv: string[]) {
}

/**
 * `camkit docs` — print the documents Camtasia reports as open, one per line,
 * or a short notice when the list comes back empty.
 */
function cmdDocs() {
// NOTE(review): two `docs` declarations and two print loops sit back to back
// in this span; it looks like the before/after lines of a change shown
// together — confirm which pair is current before relying on either.
const docs = camtasiaDocs();
const docs = camtasiaDocPaths();
if (!docs.length) {
// Empty list: say so explicitly rather than printing nothing.
console.log("Camtasia is not running, or has no projects open.");
return;
}
// Variant that prints each entry as-is.
for (const d of docs) console.log(d);
// Variant that prints each entry's name and path separated by a tab.
for (const d of docs) console.log(`${d.name}\t${d.path}`);
}

/**
 * `camkit takes <transcript.json> [gap]` — split a transcript's word list into
 * takes and print one line per take: `[start-end] (dur Nw) text`.
 *
 * @param argv Arguments after the command name. Anything starting with `--`
 *   is ignored; the first positional is the transcript path and the optional
 *   second positional is the gap in seconds (default 1.2).
 * @throws Error when the path is missing or does not exist, when `gap` is not
 *   a non-negative number, or when the file has no top-level `words` array.
 */
function cmdTakes(argv: string[]) {
  const positional = argv.filter((a) => !a.startsWith("--"));
  if (positional.length < 1) throw new Error("Usage: camkit takes <transcript.json> [gap]");
  const file = resolve(positional[0]);
  if (!existsSync(file)) throw new Error(`No such file: ${file}`);

  const rawGap = positional[1];
  // Number("") is 0, which would silently mean "split on every pause"; treat a
  // blank argument as not-a-number so it is rejected like any other typo.
  const gap = rawGap == null ? 1.2 : rawGap.trim() === "" ? Number.NaN : Number(rawGap);
  if (Number.isNaN(gap)) throw new Error("gap must be a number of seconds");
  // A negative gap would start a new take after nearly every word.
  if (gap < 0) throw new Error("gap must not be negative");

  const transcript = JSON.parse(readFileSync(file, "utf8"));
  // `?.` keeps a file whose JSON is `null` on the friendly error path instead
  // of crashing with a raw TypeError.
  if (!Array.isArray(transcript?.words)) {
    throw new Error(`${file} has no word-level "words" array (transcribe with whisper-1).`);
  }

  for (const t of segmentTakes(transcript.words, gap)) {
    const from = t.start.toFixed(2).padStart(7);
    const to = t.end.toFixed(2).padStart(7);
    const dur = (t.end - t.start).toFixed(1).padStart(5);
    const count = String(t.words.length).padStart(3);
    console.log(`[${from}-${to}] (${dur}s ${count}w) ${t.text}`);
  }
}

/**
 * `camkit words <transcript.json> <start> <end>` — print every word that lies
 * inside the `[start, end]` window as `idx start-end word`, one per line.
 *
 * @param argv Arguments after the command name. Anything starting with `--`
 *   is ignored; the three positionals are the transcript path, the window
 *   start, and the window end (both in seconds).
 * @throws Error when fewer than three positionals are given, when the file
 *   does not exist, when start/end are not numbers, or when the file has no
 *   top-level `words` array.
 */
function cmdWords(argv: string[]) {
  const positional = argv.filter((a) => !a.startsWith("--"));
  if (positional.length < 3) {
    throw new Error("Usage: camkit words <transcript.json> <start> <end>");
  }
  const [fileArg, startArg, endArg] = positional;
  const file = resolve(fileArg);
  if (!existsSync(file)) throw new Error(`No such file: ${file}`);

  // Number("") is 0, so a blank argument would silently become second zero;
  // map it to NaN so it is reported like any other non-numeric value.
  const toSeconds = (raw: string): number => (raw.trim() === "" ? Number.NaN : Number(raw));
  const start = toSeconds(startArg);
  const end = toSeconds(endArg);
  if (Number.isNaN(start) || Number.isNaN(end)) {
    throw new Error("start and end must be numbers (seconds).");
  }

  const transcript = JSON.parse(readFileSync(file, "utf8"));
  // `?.` keeps a file whose JSON is `null` on the friendly error path instead
  // of crashing with a raw TypeError.
  if (!Array.isArray(transcript?.words)) {
    throw new Error(`${file} has no word-level "words" array (transcribe with whisper-1).`);
  }

  for (const w of wordsInRange(transcript.words, start, end)) {
    const idx = String(w.idx).padStart(4);
    const from = w.start.toFixed(2).padStart(7);
    const to = w.end.toFixed(2).padStart(7);
    console.log(`${idx} ${from}-${to} ${w.word}`);
  }
}

const COMMANDS: Record<string, (argv: string[]) => void | Promise<void>> = {
Expand All @@ -501,6 +566,8 @@ const COMMANDS: Record<string, (argv: string[]) => void | Promise<void>> = {
close: cmdClose,
open: cmdOpen,
docs: cmdDocs,
takes: cmdTakes,
words: cmdWords,
};

const [cmd, ...rest] = process.argv.slice(2);
Expand Down
67 changes: 67 additions & 0 deletions packages/core/src/transcript.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,70 @@ export function shapeTranscript(raw: any, source: string, model: string): Transc
segments: (raw.segments ?? []).map((s: any) => ({ id: s.id, start: s.start, end: s.end, text: s.text })),
};
}

/**
 * Shortest duration, in seconds, that still counts as spoken audio. Whisper
 * tends to pad the end of a clip with zero-length words pinned to a single
 * timestamp; anything shorter than this is treated as that padding and must
 * be removed before take boundaries or word counts are derived.
 */
export const DEGENERATE_THRESHOLD = 0.05;

/**
 * Report whether a word is too short to be real speech.
 *
 * @param w Word whose `start`/`end` stamps are compared.
 * @param threshold Minimum duration in seconds; defaults to
 *   `DEGENERATE_THRESHOLD`.
 * @returns `true` when the word lasts strictly less than `threshold`.
 */
export function isDegenerate(w: TranscriptWord, threshold = DEGENERATE_THRESHOLD): boolean {
  const duration = w.end - w.start;
  return duration < threshold;
}

/** One take produced by `segmentTakes`: a run of kept words plus its bounds. */
export interface Take {
/** `start` stamp of the take's first kept word. */
start: number;
/** `end` stamp of the take's last kept word. */
end: number;
/** The words that make up the take, with degenerate padding words removed. */
words: TranscriptWord[];
/** The kept words' `word` strings joined with single spaces. */
text: string;
}

/**
 * Segment a word list into takes by splitting wherever the pause between two
 * consecutive audible words exceeds `gap` seconds.
 *
 * Degenerate words (Whisper padding — zero-length stamps at a frozen
 * timestamp, see `isDegenerate`) are discarded *before* pauses are measured.
 * Measuring on the raw list would let a padding word that sits inside a long
 * pause bridge it, gluing two takes into one whose duration includes the
 * silence. Because padding never enters a take, every returned take is
 * non-empty and its start/end/duration reflect audible speech only.
 *
 * @param words Word-level transcript entries in chronological order.
 * @param gap Pause, in seconds, that must be exceeded to start a new take
 *   (default 1.2).
 * @returns The takes in input order; empty when no audible words exist.
 */
export function segmentTakes(words: TranscriptWord[], gap = 1.2): Take[] {
  const takes: Take[] = [];
  let cur: TranscriptWord[] = [];

  // Close the take being accumulated (if any) and start a fresh one.
  const flush = (): void => {
    if (cur.length === 0) return;
    takes.push({
      start: cur[0].start,
      end: cur[cur.length - 1].end,
      words: cur,
      text: cur.map((w) => w.word).join(" "),
    });
    cur = [];
  };

  for (const w of words) {
    // Padding carries no audio, so it must not influence gap measurement.
    if (isDegenerate(w)) continue;
    if (cur.length > 0 && w.start - cur[cur.length - 1].end > gap) flush();
    cur.push(w);
  }
  flush();

  return takes;
}

/**
 * Collect the words lying entirely inside the closed window `[start, end]`.
 *
 * A word qualifies when it begins at or after `start` and finishes at or
 * before `end`. Every hit carries `idx`, its position in the input array, so
 * callers can refer back to the exact source entry when choosing cut points.
 *
 * @param words Word-level transcript entries.
 * @param start Lower bound of the window (inclusive).
 * @param end Upper bound of the window (inclusive).
 * @returns Matching words in input order, each with its original index.
 */
export function wordsInRange(
  words: TranscriptWord[],
  start: number,
  end: number,
): { idx: number; word: string; start: number; end: number }[] {
  const hits: { idx: number; word: string; start: number; end: number }[] = [];
  words.forEach((entry, idx) => {
    if (entry.start >= start && entry.end <= end) {
      hits.push({ idx, word: entry.word, start: entry.start, end: entry.end });
    }
  });
  return hits;
}
105 changes: 104 additions & 1 deletion packages/core/test/transcript.test.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
import { expect, test } from "bun:test";
import { shapeTranscript, toSrt, type Transcript } from "../src/transcript.ts";
import {
shapeTranscript,
toSrt,
segmentTakes,
wordsInRange,
isDegenerate,
type Transcript,
type TranscriptWord,
} from "../src/transcript.ts";

test("shapeTranscript keeps the stable contract and tolerates missing fields", () => {
const raw = {
Expand Down Expand Up @@ -37,3 +45,98 @@ test("toSrt renders 1-based, comma-millisecond, blank-line-separated cues", () =
"\n",
);
});

/** Test helper: build a transcript word from its text and start/end stamps. */
function w(word: string, start: number, end: number): TranscriptWord {
  return { word, start, end };
}

test("segmentTakes splits on gaps larger than the threshold", () => {
  // Two word pairs separated by a 2s pause, which exceeds the 1.2s threshold.
  const firstPair = [w("hello", 0, 0.5), w("world", 0.6, 1.1)];
  const secondPair = [w("second", 3.1, 3.6), w("take", 3.7, 4.0)];

  const result = segmentTakes([...firstPair, ...secondPair], 1.2);

  expect(result).toHaveLength(2);
  const [opening, closing] = result;
  expect(opening.start).toBe(0);
  expect(opening.end).toBe(1.1);
  expect(opening.text).toBe("hello world");
  expect(closing.start).toBe(3.1);
  expect(closing.end).toBe(4.0);
  expect(closing.text).toBe("second take");
});

test("segmentTakes strips degenerate tail words before computing boundaries", () => {
  // Real speech followed, after a long pause, by 20 zero-length words that
  // all share one frozen stamp — the shape of Whisper's end-of-clip padding.
  const speech = [w("real", 0, 0.5), w("speech", 0.6, 1.0), w("ends", 1.1, 1.4)];
  const frozen: TranscriptWord[] = [];
  for (let i = 0; i < 20; i++) frozen.push(w("pad", 10.0, 10.0));

  const result = segmentTakes(speech.concat(frozen), 1.2);

  // Only the spoken take survives; the padding cluster contributes nothing.
  expect(result).toHaveLength(1);
  const [only] = result;
  expect(only.start).toBe(0);
  expect(only.end).toBe(1.4);
  expect(only.text).toBe("real speech ends");
  expect(only.words).toHaveLength(3);
});

test("segmentTakes strips degenerate words mixed into a take tail", () => {
  // Zero-length words glued directly onto the end of a take (no pause).
  const audible = [w("audible", 5.0, 5.5), w("words", 5.6, 6.0)];
  const frozen = [w("frozen1", 6.0, 6.0), w("frozen2", 6.0, 6.0)];

  const result = segmentTakes([...audible, ...frozen], 1.2);

  expect(result).toHaveLength(1);
  const [only] = result;
  expect(only.end).toBe(6.0);
  expect(only.words).toHaveLength(2);
  expect(only.text).toBe("audible words");
});

test("segmentTakes drops takes that are entirely degenerate", () => {
  // Spoken words, a 5s pause, then nothing but zero-length words.
  const spoken = [w("real", 0, 0.5), w("real2", 0.6, 1.0)];
  const padding = [w("d1", 6.0, 6.0), w("d2", 6.0, 6.0)];

  const result = segmentTakes([...spoken, ...padding], 1.2);

  expect(result).toHaveLength(1);
  expect(result[0].text).toBe("real real2");
});

test("segmentTakes handles empty input", () => {
  // No words in, no takes out.
  const none: TranscriptWord[] = [];
  expect(segmentTakes(none)).toEqual([]);
});

test("isDegenerate detects zero-length and near-zero words", () => {
  // Each row: end stamp for a word starting at 10s, and the expected verdict.
  const cases: [number, boolean][] = [
    [10, true],
    [10.01, true],
    [10.06, false],
    [10.5, false],
  ];
  for (const [end, expected] of cases) {
    expect(isDegenerate(w("x", 10, end))).toBe(expected);
  }
});

test("wordsInRange filters to the inclusive window and preserves indices", () => {
  // Five back-to-back words; the window's edges land exactly on word stamps.
  const labels = ["zero", "one", "two", "three", "four"];
  const stamps: [number, number][] = [
    [0, 0.5],
    [0.6, 1.0],
    [1.1, 1.5],
    [1.6, 2.0],
    [2.1, 2.5],
  ];
  const words = labels.map((label, i) => w(label, stamps[i][0], stamps[i][1]));

  const expected = [
    { idx: 1, word: "one", start: 0.6, end: 1.0 },
    { idx: 2, word: "two", start: 1.1, end: 1.5 },
    { idx: 3, word: "three", start: 1.6, end: 2.0 },
  ];
  expect(wordsInRange(words, 0.6, 2.0)).toEqual(expected);
});
14 changes: 14 additions & 0 deletions skills/SKILLS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Skills

Repo-distributed skills for camkit. Each lives in its own directory as `<name>/SKILL.md`. Add a one-line entry here when you add a skill.

Claude Code only auto-discovers skills under `.claude/skills/`, so to use these, symlink them in once per checkout:

```sh
mkdir -p .claude/skills
ln -s ../../skills/rough-cut .claude/skills/rough-cut
```

| Skill | What it does |
|-------|--------------|
| [rough-cut](rough-cut/SKILL.md) | Transcribe the on-timeline recordings of the open Camtasia project with Whisper, then cut silences, filler, false starts, and losing retakes into a tight rough cut, optionally aligned to a script. |
Loading