Skip to content

Commit

Permalink
Initial commit.
Browse files Browse the repository at this point in the history
  • Loading branch information
johnspurlock committed Nov 18, 2022
0 parents commit fae2763
Show file tree
Hide file tree
Showing 40 changed files with 11,777 additions and 0 deletions.
20 changes: 20 additions & 0 deletions .github/workflows/automated-tests-and-build.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# CI workflow: runs the pattern tests on every push and pull request.
# On push events only, it also regenerates the build artifacts and auto-commits build/*.json.
name: Automated tests and build
on: [push, pull_request]

# contents: write — presumably required so the auto-commit step below can push (confirm against the action's docs)
permissions:
contents: write

jobs:
test-all-patterns-and-build:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
# runs for both pushes and pull requests (no `if` condition on this step)
- name: Test all patterns
run: ./.github/workflows/test-all-patterns.sh
# artifact generation is limited to push events
- name: Generate build artifacts
run: ./.github/workflows/generate-build-artifacts.sh
if: github.event_name == 'push'
# commit step is limited to push events and to files matching build/*.json
- uses: stefanzweifel/git-auto-commit-action@v4
if: github.event_name == 'push'
with:
file_pattern: 'build/*.json'
11 changes: 11 additions & 0 deletions .github/workflows/generate-build-artifacts.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/sh

# Generates the build artifacts by running generate_build_artifacts.ts with a pinned, locally-installed deno.

# stop at the first command that exits non-zero
set -e

# pinned deno release, installed into a version-specific directory next to the repo files
DENO_VERSION="v1.27.2"
DENO_DIR="./deno-$DENO_VERSION"

# download and install deno only when that directory is not already present
if [ ! -d "$DENO_DIR" ]; then
    curl -fsSL https://deno.land/x/install/install.sh | DENO_INSTALL="$DENO_DIR" sh -s "$DENO_VERSION"
fi

# run the generate script (all permissions granted, colored output disabled)
NO_COLOR=1 DENO_VERSION="$DENO_VERSION" "$DENO_DIR/bin/deno" run -A .github/workflows/generate_build_artifacts.ts
56 changes: 56 additions & 0 deletions .github/workflows/generate_build_artifacts.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// deno-lint-ignore-file no-explicit-any
import { join, fromFileUrl } from 'https://deno.land/[email protected]/path/mod.ts';

// Input (src) and output (build) directories, resolved relative to this script's own location.
const scriptPath = fromFileUrl(import.meta.url);
const src = join(scriptPath, `../../../src`);
const build = join(scriptPath, `../../../build`);

// Each source entries file yields one output file per variant, written in this order:
//  - runtime: only the core attributes needed in production
//  - examples: only the examples
const variants = [
    { suffix: 'runtime', compute: computeRuntimeContents },
    { suffix: 'examples', compute: computeExamplesContents },
];

let changed = 0;
for (const type of [ 'apps', 'bots', 'browsers', 'devices', 'libraries', 'referrers' ]) {
    const obj = JSON.parse(await Deno.readTextFile(join(src, `${type}.json`)));

    for (const { suffix, compute } of variants) {
        const contents = JSON.stringify(compute(obj), undefined, 2);
        const wasWritten = await writeTextFileIfChanged(join(build, `${type}.${suffix}.json`), contents);
        if (wasWritten) changed++;
    }
}
console.log(`Changed ${changed} file(s)`);

/**
 * Builds the "runtime" variant of a parsed entries file.
 *
 * Each entry is reduced to its `name`, `pattern` and `category` properties; every other
 * property is left out.
 *
 * @param obj parsed entries file, read here only through its `entries` array
 * @returns a new object holding the reduced `entries` array
 */
function computeRuntimeContents(obj: any) {
    const toRuntimeEntry = (raw: unknown) => {
        const record = raw as Record<string, unknown>;
        return { name: record.name, pattern: record.pattern, category: record.category };
    };
    return { entries: obj.entries.map(toRuntimeEntry) };
}

/**
 * Builds the "examples" variant of a parsed entries file.
 *
 * Entries whose `examples` property is truthy are kept, reduced to `name` and `examples`;
 * all other entries are dropped.
 *
 * @param obj parsed entries file, read here only through its `entries` array
 * @returns a new object holding the filtered and reduced `entries` array
 */
function computeExamplesContents(obj: any) {
    const entries = obj.entries.flatMap((raw: unknown) => {
        const record = raw as Record<string, unknown>;
        const name = record.name;
        const examples = record.examples;
        if (!examples) return [];
        return [ { name, examples } ];
    });
    return { entries };
}

/**
 * Writes `contents` to `path` unless the file already holds exactly that text.
 *
 * A file that cannot be found is treated as changed and is therefore written.
 *
 * @param path file to compare against and, if needed, write
 * @param contents full text the file should contain
 * @returns `true` when the file was written, `false` when it was already up to date
 */
async function writeTextFileIfChanged(path: string, contents: string): Promise<boolean> {
    const existing = await tryReadTextFile(path);
    const differs = existing !== contents;
    if (differs) {
        console.log(`Updating ${path}`);
        await Deno.writeTextFile(path, contents);
    }
    return differs;
}

/**
 * Reads a text file, mapping "file not found" to `undefined`.
 *
 * @param path file to read
 * @returns the file's text, or `undefined` when reading fails with `Deno.errors.NotFound`
 * @throws any other read error, rethrown unchanged
 */
async function tryReadTextFile(path: string): Promise<string | undefined> {
    try {
        const text = await Deno.readTextFile(path);
        return text;
    } catch (err) {
        // only a missing file is expected here; everything else is a real failure
        if (!(err instanceof Deno.errors.NotFound)) throw err;
        return undefined;
    }
}
135 changes: 135 additions & 0 deletions .github/workflows/patterns_test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
import { fail } from 'https://deno.land/[email protected]/testing/asserts.ts';
import { join, fromFileUrl } from 'https://deno.land/[email protected]/path/mod.ts';

type Type = 'apps' | 'bots' | 'browsers' | 'devices' |'libraries' | 'referrers';

type Entry = { name: string, pattern: string, examples?: string[] };

// Single 'patterns' test, run in two phases:
//  1. read each `../../../src/<type>.json` file (relative to this file) and validate every entry's properties
//  2. for every type except devices and referrers, check that each example resolves back to its own entry
//     via computeDeterministicMatch
// The first failed check calls fail(), which aborts the test with a message naming the offending entry.
Deno.test({
    name: 'patterns',
    fn: async () => {
        const entriesByType = new Map<Type, Entry[]>();

        // first, read and perform basic parsing/validation on each entries file
        for (const type of [ 'apps', 'bots', 'browsers', 'devices', 'libraries', 'referrers' ] as Type[]) {
            const filepath = join(fromFileUrl(import.meta.url), `../../../src/${type}.json`);
            const txt = await Deno.readTextFile(filepath);
            const obj = JSON.parse(txt);
            if (!Array.isArray(obj.entries)) fail(`Bad top-level object: missing 'entries' array.`);
            const names = new Set<string>(); // lower-cased names seen so far in this file
            const entries: Entry[] = [];
            let i = 0;
            for (const entry of obj.entries) {
                const tag = `${type}.entry[${i}]`;
                if (typeof entry !== 'object' || entry === null) fail(`Bad ${tag}: expected an object, found ${JSON.stringify(entry)}`);

                const { name, pattern, description, examples, svg, comments, category, urls } = entry as Record<string, unknown>;

                // name: required, trimmed, non-empty, unique within the file (case-insensitive)
                if (typeof name !== 'string') fail(`Bad ${tag}.name: expected a string property, found ${JSON.stringify(entry)}`);
                if (name.trim() !== name) fail(`Bad ${tag}.name: expected no leading or trailing whitespace, found ${name}`);
                if (name === '') fail(`Bad ${tag}.name: expected a non-blank string`);
                if (names.has(name.toLowerCase())) fail(`Bad ${tag}.name: expected a unique value, found ${name}`);
                names.add(name.toLowerCase());

                // pattern: required, non-blank, restricted to broadly-supported regex syntax
                if (typeof pattern !== 'string') fail(`Bad ${tag}.pattern: expected a string property, found ${JSON.stringify(entry)}`);
                // reject the empty string as well as whitespace-only values: an empty pattern compiles to a regex that matches every input
                if (pattern.trim() === '') fail(`Bad ${tag}.pattern: expected a non-blank string`);
                if (pattern.includes('(?:')) fail(`Bad ${tag}.pattern: non-capturing groups are not supported in all environments`);
                if (pattern.includes('(?=') || pattern.includes('(?!')) fail(`Bad ${tag}.pattern: lookaheads are not supported in all environments`);
                if (pattern.includes('(?<=') || pattern.includes('(?<!')) fail(`Bad ${tag}.pattern: lookbehinds are not supported in all environments`);
                if (pattern.includes('\\A')) fail(`Bad ${tag}.pattern: \\A (beginning of string) is not supported in all environments, use ^`);
                if (pattern.includes('\\Z')) fail(`Bad ${tag}.pattern: \\Z (end of string or before trailing newline) is not supported in all environments, use $`);
                if (pattern.includes('\\z')) fail(`Bad ${tag}.pattern: \\z (end of string) is not supported in all environments, use $`);
                // throws (failing the test) if the pattern is not a valid regular expression
                const regex = new RegExp(pattern);

                // description: optional, trimmed, non-empty
                if (description !== undefined && typeof description !== 'string') fail(`Bad ${tag}.description: expected an optional string property, found ${JSON.stringify(entry)}`);
                if (typeof description === 'string') {
                    if (description.trim() !== description) fail(`Bad ${tag}.description: expected no leading or trailing whitespace, found ${description}`);
                    if (description === '') fail(`Bad ${tag}.description: expected a non-blank string`);
                }

                // svg: optional, lower-case hyphenated filename
                if (svg !== undefined && typeof svg !== 'string') fail(`Bad ${tag}.svg: expected an optional string property, found ${JSON.stringify(entry)}`);
                if (typeof svg === 'string') {
                    if (!/^[a-z]+(-[a-z]+)*\.svg$/.test(svg)) fail(`Bad ${tag}.svg: unexpected value ${JSON.stringify(svg)}`);
                    // awaited without a catch, so any rejection from stat (e.g. a missing file) fails the test
                    await Deno.stat(join(fromFileUrl(import.meta.url), `../../../svg/${svg}`));
                }

                // comments: optional, trimmed, non-empty
                if (comments !== undefined && typeof comments !== 'string') fail(`Bad ${tag}.comments: expected an optional string property, found ${JSON.stringify(entry)}`);
                if (typeof comments === 'string') {
                    if (comments.trim() !== comments) fail(`Bad ${tag}.comments: expected no leading or trailing whitespace, found ${comments}`);
                    if (comments === '') fail(`Bad ${tag}.comments: expected a non-blank string`);
                }

                // examples: optional array of strings, each of which must match this entry's own pattern
                if (examples !== undefined) {
                    if (!Array.isArray(examples)) fail(`Bad ${tag}.examples: expected an array, found ${JSON.stringify(examples)}`);
                    examples.forEach((example: unknown, j: number) => {
                        if (typeof example !== 'string') fail(`Bad ${tag}.examples[${j}]: expected a string, found ${JSON.stringify(example)}`);
                        if (!regex.test(example)) fail(`Bad ${tag}.examples[${j}]: "${example}" does not match pattern "${pattern}"`);
                    });
                }

                // urls: optional array of http(s) urls
                if (urls !== undefined) {
                    if (!Array.isArray(urls)) fail(`Bad ${tag}.urls: expected an array, found ${JSON.stringify(urls)}`);
                    urls.forEach((url: unknown, j: number) => {
                        if (typeof url !== 'string') fail(`Bad ${tag}.urls[${j}]: expected a string, found ${JSON.stringify(url)}`);
                        if (!isValidUrl(url)) fail(`Bad ${tag}.urls[${j}]: expected url, found "${url}"`);
                    });
                }

                // category: optional, lower-case words separated by underscores
                if (category !== undefined && typeof category !== 'string') fail(`Bad ${tag}.category: expected an optional string property, found ${JSON.stringify(entry)}`);
                if (typeof category === 'string') {
                    if (!/^[a-z]+(_[a-z]+)*$/.test(category)) fail(`Bad ${tag}.category: unexpected value ${JSON.stringify(category)}`);
                }

                entries.push({ name, pattern, examples });

                i++;
            }
            entriesByType.set(type, entries);
        }

        // now that we know all files are valid, check deterministic match for each example
        for (const [ type, entries ] of entriesByType) {
            // devices and referrers are not part of the deterministic user-agent match order
            if (type === 'devices' || type === 'referrers') continue;
            let i = 0;
            for (const { name, examples } of entries) {
                const tag = `${type}.entry[${i}]`;
                let j = 0;
                for (const example of examples ?? []) {
                    const match = computeDeterministicMatch(example, entriesByType);
                    if (!match || match.name !== name || match.type !== type) {
                        fail(`Bad ${tag}.examples[${j}]: ${name} "${example}" does not match itself, deterministic match ${JSON.stringify(match)}`);
                    }
                    j++;
                }
                i++;
            }
        }
    }
});

/**
 * Checks whether a string parses as a URL using the http or https scheme.
 *
 * @param url candidate string
 * @returns `true` for a parseable `http:` or `https:` URL, `false` for any other scheme or an unparseable string
 */
function isValidUrl(url: string): boolean {
    try {
        const { protocol } = new URL(url);
        return [ 'http:', 'https:' ].includes(protocol);
    } catch {
        // the URL constructor throws on unparseable input
        return false;
    }
}

/**
 * Finds the first entry whose pattern matches the given user agent.
 *
 * Types are searched in the fixed order bots, apps, libraries, browsers; within a type,
 * entries are tried in array order. Other types present in the map are never consulted.
 *
 * @param userAgent string to test against each entry's pattern
 * @param entriesByType entries grouped by type; a type missing from the map is treated as empty
 * @returns the type and name of the first matching entry, or `undefined` when nothing matches
 */
function computeDeterministicMatch(userAgent: string, entriesByType: Map<Type, Entry[]>): { type: Type, name: string } | undefined {
    const searchOrder: Type[] = [ 'bots', 'apps', 'libraries', 'browsers' ];
    for (const type of searchOrder) {
        const candidates = entriesByType.get(type) ?? [];
        const hit = candidates.find(({ pattern }) => new RegExp(pattern).test(userAgent));
        if (hit !== undefined) return { type, name: hit.name };
    }
    return undefined;
}
11 changes: 11 additions & 0 deletions .github/workflows/test-all-patterns.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/sh

# Runs all unit tests with a pinned, locally-installed deno.

# stop at the first command that exits non-zero
set -e

# pinned deno release, installed into a version-specific directory next to the repo files
DENO_VERSION="v1.27.2"
DENO_DIR="./deno-$DENO_VERSION"

# download and install deno only when that directory is not already present
if [ ! -d "$DENO_DIR" ]; then
    curl -fsSL https://deno.land/x/install/install.sh | DENO_INSTALL="$DENO_DIR" sh -s "$DENO_VERSION"
fi

# run all unit tests (read access only, colored output disabled)
NO_COLOR=1 DENO_VERSION="$DENO_VERSION" "$DENO_DIR/bin/deno" test --allow-read
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.DS_Store
deno-*
16 changes: 16 additions & 0 deletions .vscode/pattern.code-snippets
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"New pattern entry": {
"scope": "json",
"prefix": "pattern",
"body": [
"{",
" \"name\": \"${1:name}\",",
" \"pattern\": \"${2:pattern}\",",
" \"examples\": [",
" \"${3:example}\"",
" ]",
"},",
],
"description": "Insert a new pattern entry"
}
}
6 changes: 6 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"deno.enable": true,
"files.exclude": {
"deno-*": true
}
}
21 changes: 21 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 Open Podcast Analytics Working Group

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
52 changes: 52 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# user-agents-v2

Comprehensive open-source collection of broadly-compatible regular expression patterns to identify and analyze podcast player user agents.

## Quick start

Given an HTTP [`User-Agent`](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent) found in your podcast episode server logs, to find a deterministic entity match:
- Remove any newlines (never occurs except from bad actors)
- Iterate the following json files from the [`src`](/src) directory in this order: [`bots`](/src/bots.json), [`apps`](/src/apps.json), [`libraries`](/src/libraries.json), [`browsers`](/src/browsers.json)
- Iterate the pattern file `entries` array in order, returning the first entry where `pattern` matches the User-Agent
- This will always result in either 0 or 1 entry
- If found, the containing file can be used as the `type` of the entry (e.g. `bot` if found in the [`bots`](/src/bots.json) file)

_(Optional)_ If `type` is not `bot`, to additionally break down by device:
- Iterate the [`devices`](/src/devices.json) pattern file `entries` array in order, returning the first entry where `pattern` matches the User-Agent
- This will always result in either 0 or 1 entry
- If found, the device will also have a `category` for high-level category such as `mobile`, `smart_speaker`, or `computer`

_(Optional)_ If `type` is `browser` and you also have the HTTP [`Referer`](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Referer) header in your logs, to additionally break down by known web apps:
- Remove any newlines (never occurs except from bad actors)
- Iterate the [`referrers`](/src/referrers.json) pattern file `entries` array in order, returning the first entry where `pattern` matches the Referer
- This will always result in either 0 or 1 entry
- If found, the referrer entity may also have a `category` of `app` (for web-based apps) or `host` (for podcast hosting company players)

## Approach

This collection is an evolution of the original [OPAWG User agent list](https://github.com/opawg/user-agents), refactored in some ways and overlaid with ideas from the excellent [Buzzsprout Podcast Agents](https://github.com/buzzsprout/podcast-agent) Ruby gem.

Some of the goals of this collection:
- **Data files instead of code**: This is not an NPM package or dependent on any specific programming environment or platform, the patterns files are in standard JSON format, with a shared [JSON schema](/schemas/patterns.schema.json) (automatic type-checking and code-assist in IDEs). There is no code outside of the [`.github/workflows`](/.github/workflows) folder, which runs the automated tests on every change.
- **Deterministic**: Given a User-Agent, everyone should end up with the same unique result (0 or 1 entry), regardless of programming language or environment.
- **Fast and compatible**: Keep to basic regular expression patterns, avoid features such as lookaheads that are expensive or unavailable in certain environments.
- **Comprehensive**: Goal is to match the vast majority of current podcast user-agents in the wild.
- **Multi-dimensional**: While basic entity matching is deterministic, optional breakdowns by device and device category are separated out into a separate file - making the patterns simpler, and focusing on attributes useful for standard podcast stats reporting.
- **Web-aware**: Optionally identify known web-based apps and other players using the Referer header, given that web-based apps cannot set a custom User-Agent.
- **Testable**: Examples are included attached to the entries themselves, [automated checks](/.github/workflows/patterns_test.ts) are run against the patterns after every push and pull request, to ensure quality going forward.
- **Easy to contribute**: Help make this collection better by adding examples to an existing file, or adding new entries. Sanity checks will run automatically on any pending pull requests.

## Evolution

These patterns were initially created with a one-time automated transform of the original [OPAWG User agent list](https://github.com/opawg/user-agents), with the following transformations:
- Converted top-level array to multiple top-level files by type, each with a top-level object - easier to deserialize in some environments than a top-level array.
- Removed unnecessary forward-slash escaping `\/` in patterns.
- Merged duplicate entries into a single entry, then sorted alphabetically.
- Removed lookaheads, re-ordering certain entries where necessary to emulate the same matching behavior.
- Combined multiple expressions for a single entry into a single regex pattern (separated by `|`), simpler and faster than compiling multiple patterns per entry.
- Fixed some of the incorrect patterns.
- Dropped support for OPAWG "device" and "os" attributes. Instead, introduced a new [`devices`](/src/devices.json) entries file ported from Buzzsprout's Ruby code. Simplified patterns that no longer needed app+device+os specificity.
- Added a [JSON schema](/schemas/patterns.schema.json), fixed validation errors found - like incorrect info urls.
- Imported several new entries and examples from Buzzsprout's test data file.
- Ran against a large set of found data, and added yet more entries and examples.
- Fixed issues found when running against the new automated checks. In addition to basic JSON-level data checks, the automated tests ensure each example matches against its parent entry when running through the deterministic matching algorithm mentioned above.
11 changes: 11 additions & 0 deletions build/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Build artifacts

These variant files are automatically generated from their corresponding entries JSON file over in the [src](/src) directory - so _no need to modify any of these by hand_.

Currently, two smaller JSON files are produced for every source entries file:
- a "runtime" variant with only the entry properties: `name`, `pattern`, and optional `category`
- Useful if you need a minimal version to include/reference in your production codebase
- an "examples" variant with only the entry properties: `name` and `examples`
- Useful if you want the examples separately


Loading

0 comments on commit fae2763

Please sign in to comment.