From b2bfb824d2130cfdd22d97f44ae19508a6e8f01e Mon Sep 17 00:00:00 2001 From: k8n Date: Mon, 9 Jun 2025 09:53:12 -0400 Subject: [PATCH 1/6] An option select specific content types to convert --- README.md | 12 ++++++++++++ src/parser.js | 10 ++++++++-- src/questions.js | 6 ++++++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 72fa0634..17aebe0a 100644 --- a/README.md +++ b/README.md @@ -145,6 +145,18 @@ Allowed values: - A comma separated list with any of the following: `author`, `categories`, `coverImage`, `date`, `draft`, `excerpt`, `id`, `slug`, `tags`, `title`, `type`. You can rename a field by appending `:` and the alias to use. For example, `date:created` will rename `date` to `created`. +### Specific content types + +``` +--post-types=post,page +``` + +Comma separated list of the content types to include in Markdown files. Leave empty to include all default content types. + +Allowed values: + +- A comma separated list: `post`, `page`, etc. + ### Delay between image file requests? 
``` diff --git a/src/parser.js b/src/parser.js index 3363c054..d7908e4b 100644 --- a/src/parser.js +++ b/src/parser.js @@ -12,7 +12,13 @@ export async function parseFilePromise() { const rssData = await data.load(content); const allPostData = rssData.child('channel').children('item'); - const postTypes = getPostTypes(allPostData); + let postTypes = getPostTypes(allPostData); + if (shared.config.postTypes?.length) { + postTypes = postTypes.filter((postType) => + shared.config.postTypes.includes(postType) + ); + } + const posts = collectPosts(allPostData, postTypes); const images = []; @@ -139,7 +145,7 @@ function collectScrapedImages(allPostData, postTypes) { postTypes.forEach((postType) => { getItemsOfType(allPostData, postType).forEach((postData) => { const postId = postData.childValue('post_id'); - + const postContent = postData.childValue('encoded'); const scrapedUrls = [...postContent.matchAll(/]+?(?<=\s)src="(.+?)"[^>]*>/gi)].map((match) => match[1]); scrapedUrls.forEach((scrapedUrl) => { diff --git a/src/questions.js b/src/questions.js index e9bec84d..e56a001e 100644 --- a/src/questions.js +++ b/src/questions.js @@ -112,6 +112,12 @@ export function load() { description: 'Frontmatter fields', default: 'title,date,categories,tags,coverImage,draft' }, + { + name: 'post-types', + type: 'list', + description: 'Post types to convert (empty for all)', + default: [] + }, { name: 'request-delay', type: 'integer', From 4433985b8196726fd6515badf0b5ff59cc84db0b Mon Sep 17 00:00:00 2001 From: k8n Date: Mon, 9 Jun 2025 22:18:25 -0400 Subject: [PATCH 2/6] Ability to exclude specific post types --- README.md | 8 ++++++++ src/parser.js | 7 +++++++ src/questions.js | 6 ++++++ 3 files changed, 21 insertions(+) diff --git a/README.md b/README.md index 17aebe0a..f07b5122 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,14 @@ Allowed values: - A comma separated list: `post`, `page`, etc. 
+### Exclude specific content types + +``` +--exclude-post-types=nf_sub,et_pb_layout,acf-post-type,acf-field,acf-field-group,rm_content_editor,rank_math_schema +``` + +Comma separated list of the content types to exclude from Markdown files. Leave empty to include all default content types. + ### Delay between image file requests? ``` diff --git a/src/parser.js b/src/parser.js index d7908e4b..42d64cfb 100644 --- a/src/parser.js +++ b/src/parser.js @@ -13,12 +13,19 @@ export async function parseFilePromise() { const allPostData = rssData.child('channel').children('item'); let postTypes = getPostTypes(allPostData); + if (shared.config.postTypes?.length) { postTypes = postTypes.filter((postType) => shared.config.postTypes.includes(postType) ); } + if (shared.config.excludePostTypes?.length) { + postTypes = postTypes.filter((postType) => + ! shared.config.postTypes.includes(postType) + ); + } + const posts = collectPosts(allPostData, postTypes); const images = []; diff --git a/src/questions.js b/src/questions.js index e56a001e..87873a0a 100644 --- a/src/questions.js +++ b/src/questions.js @@ -118,6 +118,12 @@ export function load() { description: 'Post types to convert (empty for all)', default: [] }, + { + name: 'exclude-post-types', + type: 'list', + description: 'Post types to convert (empty for all)', + default: [] + }, { name: 'request-delay', type: 'integer', From c1ace25b9dfe719a82bbb60a6ad47710a11ace58 Mon Sep 17 00:00:00 2001 From: k8n Date: Mon, 9 Jun 2025 22:14:58 -0400 Subject: [PATCH 3/6] Output arbitrary WP post meta into frontmatter --- README.md | 8 ++++++++ package.json | 1 + src/parser.js | 30 ++++++++++++++++++++++++++++-- src/questions.js | 7 +++++++ src/writer.js | 12 ++++++++++++ 5 files changed, 56 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f07b5122..e087679a 100644 --- a/README.md +++ b/README.md @@ -145,6 +145,14 @@ Allowed values: - A comma separated list with any of the following: `author`, `categories`, 
`coverImage`, `date`, `draft`, `excerpt`, `id`, `slug`, `tags`, `title`, `type`. You can rename a field by appending `:` and the alias to use. For example, `date:created` will rename `date` to `created`. +### Frontmatter meta + +``` +--frontmatter-meta=rank_math_seo_score:seo_score,rank_math_contentai_score +``` + +Comma separated list of the WP post meta values to include in the frontmatter of Markdown files. Serialized PHP arrays get unserialized and converted to corresponding YAML structures. + ### Specific content types ``` diff --git a/package.json b/package.json index 4677077e..4395cb93 100644 --- a/package.json +++ b/package.json @@ -24,6 +24,7 @@ "chalk": "^5.4.1", "commander": "^13.1.0", "luxon": "^3.5.0", + "php-serialize": "^5.1.3", "turndown": "^7.2.0", "xml2js": "^0.6.2" }, diff --git a/src/parser.js b/src/parser.js index 42d64cfb..6c46a3ba 100644 --- a/src/parser.js +++ b/src/parser.js @@ -5,6 +5,7 @@ import * as data from './data.js'; import * as frontmatter from './frontmatter.js'; import * as shared from './shared.js'; import * as translator from './translator.js'; +import { unserialize } from 'php-serialize' export async function parseFilePromise() { shared.logHeading('Parsing'); @@ -115,7 +116,7 @@ function buildPost(data) { // these are possibly set later in mergeImagesIntoPosts() coverImage: undefined, - imageUrls: [] + imageUrls: [], }; } @@ -127,7 +128,22 @@ function getPostDate(data) { function getPostMetaValue(data, key) { const metas = data.children('postmeta'); const meta = metas.find((meta) => meta.childValue('meta_key') === key); - return meta ? meta.childValue('meta_value') : undefined; + + const raw = meta ? 
meta.childValue('meta_value') : undefined; + + // If it looks like a PHP-serialized array/object, deserialize it + if (typeof raw === 'string' && /^a:\d+:/.test(raw.trim())) { + try { + // Note: you must have installed `php-serialize` + const parsed = unserialize(raw) + return parsed + } catch (e) { + // fallback to the raw string if unserialization fails + console.log(`Failed to unserialize meta value for key "${key}": ${raw}`); + } + } + + return raw; } function collectAttachedImages(allPostData) { @@ -217,6 +233,16 @@ function populateFrontmatter(posts) { post.frontmatter[alias ?? key] = frontmatterGetter(post); }); + + + // Handling for meta fields + shared.config.frontmatterMeta.forEach((field) => { + const [key, alias] = field.split(':'); + const value = getPostMetaValue(post.data, key); + if (value !== undefined && value !== null && value !== '') { + post.frontmatter[alias ?? key] = value; + } + }); }); } diff --git a/src/questions.js b/src/questions.js index 87873a0a..2fe341d2 100644 --- a/src/questions.js +++ b/src/questions.js @@ -112,6 +112,13 @@ export function load() { description: 'Frontmatter fields', default: 'title,date,categories,tags,coverImage,draft' }, + { + name: 'frontmatter-meta', + type: 'list', + description: 'Meta fields to add to frontmatter', + // e.g. 
'rank_math_seo_score,rank_math_contentai_score' + default: [] + }, { name: 'post-types', type: 'list', diff --git a/src/writer.js b/src/writer.js index cb73b131..a9e75066 100644 --- a/src/writer.js +++ b/src/writer.js @@ -96,6 +96,18 @@ async function loadMarkdownFilePromise(post) { } else if (typeof value === 'boolean') { // output unquoted outputValue = value.toString(); + } else if (value !== null && typeof value === 'object') { + // Nested objects → YAML mappings + outputValue = "" + for (const [subKey, subVal] of Object.entries(value)) { + outputValue += `\n ${subKey}: ${subVal}` + } + } else if (typeof value === 'string' && value.includes('\n')) { + // Multi-line strings → literal block + // outputValue = `${key}: |\n` + value.split('\n').forEach(line => { + outputValue += ` ${line}\n` + }) } else { // single string value const escapedValue = (value ?? '').replace(/"/g, '\\"'); From cf65f42df2628b42dfec80c57416ee48ae01775d Mon Sep 17 00:00:00 2001 From: k8n Date: Mon, 9 Jun 2025 23:02:02 -0400 Subject: [PATCH 4/6] A hack to extract and append named WP post meta values to content --- README.md | 15 +++++++++++++++ src/parser.js | 11 +++++++++++ src/questions.js | 6 ++++++ src/writer.js | 6 ++++++ 4 files changed, 38 insertions(+) diff --git a/README.md b/README.md index e087679a..c2ab7c29 100644 --- a/README.md +++ b/README.md @@ -153,6 +153,21 @@ Allowed values: Comma separated list of the WP post meta values to include in the frontmatter of Markdown files. Serialized PHP arrays get unserialized and converted to corresponding YAML structures. +### Append WP post meta to Content + +``` +--append-meta=staff_sidebar:sidebar +``` + +Extract listed WP post meta and append it to content using MDC component syntax. + +e.g. 
+``` +::sidebar +WP meta content from 'staff_sidebar' post meta key converted to markdown +:: +``` + ### Specific content types ``` diff --git a/src/parser.js b/src/parser.js index 6c46a3ba..e9ecab01 100644 --- a/src/parser.js +++ b/src/parser.js @@ -117,6 +117,17 @@ function buildPost(data) { // these are possibly set later in mergeImagesIntoPosts() coverImage: undefined, imageUrls: [], + + metaContent: Object.fromEntries( + shared.config.appendMeta.map((field) => { + const [key, alias] = field.split(':'); + const value = getPostMetaValue(data, key); + if (value !== undefined && value !== null && value !== '') { + // treat the value + return [alias ?? key, translator.getPostContent(value)]; + } + }) + ) }; } diff --git a/src/questions.js b/src/questions.js index 2fe341d2..2d0f8bc3 100644 --- a/src/questions.js +++ b/src/questions.js @@ -119,6 +119,12 @@ export function load() { // e.g. 'rank_math_seo_score,rank_math_contentai_score' default: [] }, + { + name: 'append-meta', + type: 'list', + description: 'Meta fields to apend to the content as MDC components', + default: [] + }, { name: 'post-types', type: 'list', diff --git a/src/writer.js b/src/writer.js index a9e75066..e111bb20 100644 --- a/src/writer.js +++ b/src/writer.js @@ -122,6 +122,12 @@ async function loadMarkdownFilePromise(post) { }); output += `---\n\n${post.content}\n`; + + // for each post.metaContent object attribute, append to output + Object.entries(post.metaContent).forEach(([key, value]) => { + output += `\n\n::${key}\n${value}\n::\n`; + }); + return output; } From 78746348ce8956dc71b5354eff70b2a0211f3e3c Mon Sep 17 00:00:00 2001 From: k8n Date: Mon, 9 Jun 2025 23:23:16 -0400 Subject: [PATCH 5/6] An option to strip shortcodes before converting to markdown --- README.md | 12 ++++++++++++ src/parser.js | 2 +- src/questions.js | 6 ++++++ src/translator.js | 33 ++++++++++++++++++++++++++++++++- 4 files changed, 51 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 
c2ab7c29..8fd34095 100644 --- a/README.md +++ b/README.md @@ -188,6 +188,18 @@ Allowed values: Comma separated list of the content types to exclude from Markdown files. Leave empty to include all default content types. +### Strip shortcodes + +``` +--strip-shortcodes=true +``` + +Strip shortcodes from content, converting the content therein into simple `<div>`
tags. + +Allowed values: + +- `true` or `false`. + ### Delay between image file requests? ``` diff --git a/src/parser.js b/src/parser.js index e9ecab01..40489c18 100644 --- a/src/parser.js +++ b/src/parser.js @@ -126,7 +126,7 @@ function buildPost(data) { // treat the value return [alias ?? key, translator.getPostContent(value)]; } - }) + }).filter(x => x) ) }; } diff --git a/src/questions.js b/src/questions.js index 2d0f8bc3..de031004 100644 --- a/src/questions.js +++ b/src/questions.js @@ -125,6 +125,12 @@ export function load() { description: 'Meta fields to apend to the content as MDC components', default: [] }, + { + name: 'strip-shortcodes', + type: 'boolean', + description: 'Strip shortcodes from content', + default: false + }, { name: 'post-types', type: 'list', diff --git a/src/translator.js b/src/translator.js index 8db1246a..b590a576 100644 --- a/src/translator.js +++ b/src/translator.js @@ -108,8 +108,39 @@ function initTurndownService() { return turndownService; } +/** + * Convert any WordPress shortcode into a
<div> wrapper, + * strip out self-closing shortcodes entirely. + */ +function simplifyAllShortcodes(content) { + // 1) Capture: + // 1st group = the tag name ([foo] or [/foo]) + // 2nd group = any attributes (optional) + // 3rd group = a trailing slash (if self-closing) + const pattern = /\[\/?([A-Za-z0-9_]+)([^\]]*?)(\/)?\]/g; + + return content.replace(pattern, (fullMatch, tagName, attrs, selfClosing) => { + // 2) If it’s self-closing (matched that 3rd capture), drop it + if (selfClosing) { + return ''; + } + + // 3) Otherwise decide opening vs closing: + const isClosing = fullMatch.startsWith('[/' + tagName); + + // 4) Replace with <div> or </div> + return isClosing + ? `</div>` + : `<div>`; + }); +} + export function getPostContent(content) { - // insert an empty div element between double line breaks + if (shared.config.stripShortcodes) { + content = simplifyAllShortcodes(content); + } + + // insert an empty div element between double line breaks // this nifty trick causes turndown to keep adjacent paragraphs separated // without mucking up content inside of other elements (like blocks) content = content.replace(/(\r?\n){2}/g, '\n<div></div>
\n'); From 6a2ad9420adac5c3b681d7a8c8697f3799bcc472 Mon Sep 17 00:00:00 2001 From: k8n Date: Tue, 24 Jun 2025 17:17:32 -0400 Subject: [PATCH 6/6] Add Polylang support and nested frontmatter meta Introduces Polylang translation set processing, including grouping posts by translation sets and writing language-specific Markdown files. Adds support for nested frontmatter meta using dotted notation, category-based filtering, and new CLI options for Polylang and category inclusion/exclusion. Updates documentation and refactors file writing logic to handle translation groups and images accordingly. --- .prettierrc.json | 4 + README.md | 34 +++++- src/frontmatter.js | 24 ++++ src/parser.js | 204 ++++++++++++++++++++++++++----- src/questions.js | 25 ++++ src/writer.js | 296 ++++++++++++++++++++++++++++++++++++--------- 6 files changed, 497 insertions(+), 90 deletions(-) create mode 100644 .prettierrc.json diff --git a/.prettierrc.json b/.prettierrc.json new file mode 100644 index 00000000..7ac14558 --- /dev/null +++ b/.prettierrc.json @@ -0,0 +1,4 @@ + { + "useTabs": true, + "trailingComma": "none" + } diff --git a/README.md b/README.md index 8fd34095..a6ccab38 100644 --- a/README.md +++ b/README.md @@ -148,10 +148,19 @@ Allowed values: ### Frontmatter meta ``` ---frontmatter-meta=rank_math_seo_score:seo_score,rank_math_contentai_score +--frontmatter-meta=rank_math_seo_score:seo.score,title:seo.title,rank_math_contentai_score ``` -Comma separated list of the WP post meta values to include in the frontmatter of Markdown files. Serialized PHP arrays get unserialized and converted to corresponding YAML structures. +Comma separated list of the WP post meta values to include in the frontmatter of Markdown files. Serialized PHP arrays get unserialized and converted to corresponding YAML structures. 
Dotted notation for nested frontmatter placement is supported, with the example above resulting in the following output: + +```yaml +--- +seo: + score: 90 + title: SEO Title +rank_math_contentai_score: 85 +--- +``` ### Append WP post meta to Content @@ -188,6 +197,15 @@ Allowed values: Comma separated list of the content types to exclude from Markdown files. Leave empty to include all default content types. +### Specific categories + +``` +--include-categories=news,resources +--exclude-categories=updates +``` + +Include or exclude content from specific categories based on those slugs. + ### Strip shortcodes ``` @@ -200,6 +218,18 @@ Allowed values: - `true` or `false`. +### Polylang translation sets support + +``` +--polylang=true +``` + +Include translations of posts in Markdown files. Translation sets will be produced with locale suffixes. E.g. `index.en.md` and `index.fr.md` for English- and French-language versions of the content. + +Allowed values: + +- `true` or `false`. + ### Delay between image file requests? ``` diff --git a/src/frontmatter.js b/src/frontmatter.js index 22f37c94..1f0c3f39 100644 --- a/src/frontmatter.js +++ b/src/frontmatter.js @@ -22,6 +22,11 @@ export function date(post) { return post.date; } +export function status(post) { + // status of the post, previously parsed and decoded + return post.data.childValue('status'); +} + export function draft(post) { // boolean representing the previously parsed draft status, only included when true return post.isDraft ? true : undefined; @@ -34,6 +39,11 @@ export function excerpt(post) { return encoded ?
encoded.replace(/[\r\n]+/gm, ' ') : undefined; } +export function language(post) { + // language code, previously parsed and decoded + return post.polylang?.language || shared.config.polylangDefaultLanguage; +} + export function id(post) { // previously parsed as a string, converted to integer here return parseInt(post.id); @@ -44,6 +54,20 @@ export function slug(post) { return post.slug; } +export function link(post) { + // previously parsed and decoded + if (post.link) { + try { + const url = new URL(post.link); + return url.pathname; // Extracts the path portion of the URL + } catch (error) { + // If post.link is not a valid URL, return it as is + return post.link; + } + } + return post.link; +} + export function tags(post) { // array of decoded tag names (yes, they come from nodes, not a typo) const categories = post.data.children('category'); diff --git a/src/parser.js b/src/parser.js index 40489c18..13a1c062 100644 --- a/src/parser.js +++ b/src/parser.js @@ -8,39 +8,113 @@ import * as translator from './translator.js'; import { unserialize } from 'php-serialize' export async function parseFilePromise() { - shared.logHeading('Parsing'); - const content = await fs.promises.readFile(shared.config.input, 'utf8'); - const rssData = await data.load(content); - const allPostData = rssData.child('channel').children('item'); + shared.logHeading('Parsing'); - let postTypes = getPostTypes(allPostData); + // 1) Load and parse the XML + const content = await fs.promises.readFile(shared.config.input, 'utf8'); + const rssData = await data.load(content); + const allPostData = rssData.child('channel').children('item'); - if (shared.config.postTypes?.length) { - postTypes = postTypes.filter((postType) => - shared.config.postTypes.includes(postType) - ); - } + // 2) Determine which post types to include/exclude + let postTypes = getPostTypes(allPostData); + if (shared.config.postTypes?.length) { + postTypes = postTypes.filter(pt => shared.config.postTypes.includes(pt)); + } + 
if (shared.config.excludePostTypes?.length) { + postTypes = postTypes.filter(pt => !shared.config.excludePostTypes.includes(pt)); + } - if (shared.config.excludePostTypes?.length) { - postTypes = postTypes.filter((postType) => - ! shared.config.postTypes.includes(postType) - ); - } + // 3) Collect the basic posts + let posts = collectPosts(allPostData, postTypes); - const posts = collectPosts(allPostData, postTypes); + // 3) category‐based filtering + if (shared.config.includeCategories.length) { + posts = posts.filter(post => + frontmatter.categories(post)?.some(cat => + shared.config.includeCategories.includes(cat) + ) + ); + } + if (shared.config.excludeCategories.length) { + posts = posts.filter(post => + !frontmatter.categories(post)?.some(cat => + shared.config.excludeCategories.includes(cat) + ) + ); + } - const images = []; - if (shared.config.saveImages === 'attached' || shared.config.saveImages === 'all') { - images.push(...collectAttachedImages(allPostData)); - } - if (shared.config.saveImages === 'scraped' || shared.config.saveImages === 'all') { - images.push(...collectScrapedImages(allPostData, postTypes)); - } + // 4) Collect images exactly as before + const images = []; + if (shared.config.saveImages === 'attached' || shared.config.saveImages === 'all') { + images.push(...collectAttachedImages(allPostData)); + } + if (shared.config.saveImages === 'scraped' || shared.config.saveImages === 'all') { + images.push(...collectScrapedImages(allPostData, postTypes)); + } + mergeImagesIntoPosts(images, posts); + + // 5) Build a lookup by post ID so we can enrich posts in place + const postById = Object.fromEntries(posts.map(p => [String(p.id), p])); + + // 6) Parse all entries (namespace stripped) for post_translations groups + const termMappings = {}; + const termNodes = rssData.child('channel').children('term') || []; + for (const term of termNodes) { + const taxonomy = term.childValue('term_taxonomy'); + if (taxonomy !== 'post_translations') continue; - 
mergeImagesIntoPosts(images, posts); - populateFrontmatter(posts); + const slug = term.childValue('term_slug'); + const rawDesc = term.childValue('term_description') || ''; + + try { + const parsed = unserialize(rawDesc); + // Normalize IDs to strings + termMappings[slug] = Object.fromEntries( + Object.entries(parsed).map(([lang, id]) => [lang, String(id)]) + ); + } catch (err) { + console.warn(`⚠️ Could not parse term_description for ${slug}`, err); + } + } - return posts; + // 7) Walk each again to pull out Polylang categories + for (const item of allPostData) { // use just 'posts', pre-filtered? + const id = item.childValue('post_id'); + const post = postById[id]; + if (!post) continue; + + // Initialize polylang container + post.polylang = { + language: null, + groupSlug: null, + translationMap: null + }; + + // Read all tags on this item (xml2js puts them in item.category[]) + const cats = item.children('category') || []; + for (const cat of cats) { + // xml2js stores attributes under `.$` + const domain = cat.attribute('domain'); + const nicename = cat.attribute('nicename'); + if (domain === 'language') { + post.polylang.language = nicename; + } + if (domain === 'post_translations') { + post.polylang.groupSlug = nicename; + } + } + + // Attach the full translationMap if we have one + const gs = post.polylang.groupSlug; + if (gs && termMappings[gs]) { + post.polylang.translationMap = termMappings[gs]; + } + } + + // 8) Finally, build frontmatter (and any other per-post enrichment) + populateFrontmatter(posts); + + return posts; } function getPostTypes(allPostData) { @@ -98,6 +172,49 @@ function collectPosts(allPostData, postTypes) { return allPosts; } +/** + * allPosts: an array of post objects, each with: + * - post.id (string or number) + * - post.slug (string) + * - post.language (string or null) + * - post._pllTranslations (object or null) + * + * Returns: an object whose keys are groupKey (string), + * and whose values are arrays of post objects in that 
group. + */ + export function buildTranslationGroups(allPosts) { + const groups = {}; + + for (const post of allPosts) { + let key; + + const tm = post.polylang.translationMap; + if (tm && Object.keys(tm).length > 0) { + // tm values are post IDs as strings + const ids = Object.values(tm).slice().sort(); + key = ids.join(','); + } else { + key = String(post.id); + } + + if (!groups[key]) groups[key] = []; + groups[key].push(post); + } + + return groups; + } + +export function chooseBaseSlug(postsInGroup, defaultLangCode) { + // postsInGroup is an array of post objects, each with post.slug and post.language. + // 1. Try to find the post whose post.language === defaultLangCode: + let candidate = postsInGroup.find(p => p.language === defaultLangCode); + if (candidate) { + return candidate.slug; + } + // 2. If none matched (rare if the group didn’t contain the default), pick the first post’s slug: + return postsInGroup[0].slug; +} + function buildPost(data) { return { // full raw post data @@ -109,6 +226,8 @@ function buildPost(data) { // particularly useful values for all sorts of things type: data.childValue('post_type'), id: data.childValue('post_id'), + link: data.childValue('link'), + isPublished: data.childValue('status') === 'publish', isDraft: data.childValue('status') === 'draft', slug: decodeURIComponent(data.childValue('post_name')), date: getPostDate(data), @@ -231,6 +350,27 @@ function mergeImagesIntoPosts(images, posts) { }); } +/** + * Deep-sets `obj[path[0]][path[1]]… = value`, creating intermediate + * objects if they don’t yet exist. + * + * @param {object} obj The object to modify + * @param {string[]} path Array of keys, e.g. 
['seo','title'] + * @param {*} value The value to assign + */ +function setNested(obj, path, value) { + let cur = obj; + for (let i = 0; i < path.length - 1; i++) { + const key = path[i]; + if (cur[key] == null || typeof cur[key] !== 'object') { + cur[key] = {}; + } + cur = cur[key]; + } + // final segment + cur[path[path.length - 1]] = value; +} + function populateFrontmatter(posts) { posts.forEach((post) => { post.frontmatter = {}; @@ -248,10 +388,16 @@ function populateFrontmatter(posts) { // Handling for meta fields shared.config.frontmatterMeta.forEach((field) => { - const [key, alias] = field.split(':'); - const value = getPostMetaValue(post.data, key); + // split “metaKey:alias.path” or just “metaKey” + const [metaKey, rawAlias] = field.split(':').map(s => s.trim()); + const alias = rawAlias || metaKey; + + const value = getPostMetaValue(post.data, metaKey); if (value !== undefined && value !== null && value !== '') { - post.frontmatter[alias ?? key] = value; + // build the path segments for nested assignment: + const pathSegments = alias.split('.'); + // deep‐assign into post.frontmatter + setNested(post.frontmatter, pathSegments, value); } }); }); diff --git a/src/questions.js b/src/questions.js index de031004..76e2e4f7 100644 --- a/src/questions.js +++ b/src/questions.js @@ -130,6 +130,18 @@ export function load() { type: 'boolean', description: 'Strip shortcodes from content', default: false + }, + { + name: 'polylang', + type: 'boolean', + description: 'Process Polylang translation sets', + default: false + }, + { + name: 'polylang-default-language', + type: 'string', + description: 'Polylang translation set to process', + default: 'en' }, { name: 'post-types', @@ -143,6 +155,19 @@ export function load() { description: 'Post types to convert (empty for all)', default: [] }, + { + name: 'include-categories', + type: 'list', + description: 'Only export posts in these category slugs (domain="category")', + // e.g. 
'news,resources,updates' + default: [], + }, + { + name: 'exclude-categories', + type: 'list', + description: 'Exclude posts in these categories', + default: [], + }, { name: 'request-delay', type: 'integer', diff --git a/src/writer.js b/src/writer.js index e111bb20..c04df6c7 100644 --- a/src/writer.js +++ b/src/writer.js @@ -7,9 +7,28 @@ import * as luxon from 'luxon'; import path from 'path'; import * as shared from './shared.js'; +import { buildTranslationGroups, chooseBaseSlug } from './parser.js'; + export async function writeFilesPromise(posts) { - await writeMarkdownFilesPromise(posts); - await writeImageFilesPromise(posts); + + let groupMap = {}; + + // Polylang translation sets need a different approach + if (shared.config.polylang) { + // Build the “groupMap” only if Polylang mode is on, otherwise treat each post as its own group + console.log('Building Polylang translation groups...') + groupMap = buildTranslationGroups(posts) + } else { + // If not using Polylang, place each post in its own group so we can reuse the same loop later + for (const post of posts) { + groupMap[String(post.id)] = [post]; + } + } + + console.log(Object.keys(groupMap).filter(key => key.length > 5)) + + await writeMarkdownFilesPromise(groupMap); + await writeImageFilesPromise(groupMap); } async function processPayloadsPromise(payloads, loadFunc) { @@ -41,33 +60,152 @@ async function writeFile(destinationPath, data) { await fs.promises.writeFile(destinationPath, data); } -async function writeMarkdownFilesPromise(posts) { - // package up posts into payloads - let existingCount = 0; - let delay = 0; - const payloads = posts.flatMap((post) => { - const destinationPath = shared.buildPostPath(post); - if (checkFile(destinationPath)) { - // already exists, don't need to save again - existingCount++; - return []; - } else { - const payload = { - item: post, - type: post.type, - name: shared.getSlugWithFallback(post), - destinationPath, - delay - }; - delay += shared.config.writeDelay; - 
/**
 * Turns a list of resolved write targets into payloads, skipping targets whose
 * destination file is already on disk.
 *
 * Each payload's `delay` is the running sum of `shared.config.writeDelay`, so
 * the first queued payload gets 0 and each later one is offset by one more
 * step. Targets that already exist do not advance the delay.
 *
 * @param {Array<{post: Object, name: string, destinationPath: string}>} targets
 * @returns {{payloads: Array<Object>, existingCount: number}}
 */
function queueMarkdownTargets(targets) {
  const payloads = [];
  let existingCount = 0;
  let delay = 0;

  for (const { post, name, destinationPath } of targets) {
    if (fs.existsSync(destinationPath)) {
      existingCount++;
      continue;
    }
    payloads.push({
      item: post,
      type: post.type,
      name,
      destinationPath,
      delay
    });
    delay += shared.config.writeDelay;
  }

  return { payloads, existingCount };
}

/**
 * Resolves the destination of every translation in one Polylang group.
 *
 * The group shares the base slug returned by chooseBaseSlug(). Paths are
 * obtained by handing shared.buildPostPath() a copy of a post whose slug has
 * been overridden:
 * - with post folders: all translations go into the folder of the base-slug
 *   path, as "index.<lang>.md";
 * - without post folders: each translation is built as slug "<base>.<lang>".
 *
 * @param {Array<Object>} postsInGroup Posts belonging to one translation group.
 * @returns {Array<{post: Object, name: string, destinationPath: string}>}
 */
function resolveTranslationTargets(postsInGroup) {
  const baseSlug = chooseBaseSlug(postsInGroup, shared.config.defaultLanguage);
  const groupFolder = shared.config.postFolders
    ? path.dirname(shared.buildPostPath({ ...postsInGroup[0], slug: baseSlug }))
    : null;

  return postsInGroup.map((post) => {
    // A grouped post may lack Polylang data; fall back to 'und' instead of throwing.
    const lang = post.polylang?.language || 'und';
    const name = `${baseSlug}.${lang}`;
    const destinationPath = shared.config.postFolders
      ? path.join(groupFolder, `index.${lang}.md`)
      : shared.buildPostPath({ ...post, slug: name });
    return { post, name, destinationPath };
  });
}

/**
 * Earlier variant of the Markdown writer. In Polylang mode it places a
 * translation group directly under `shared.config.output` (in a `<baseSlug>`
 * sub-folder when post folders are enabled, which it creates eagerly) instead
 * of deriving the location from shared.buildPostPath().
 *
 * NOTE(review): appears to be superseded by writeMarkdownFilesPromise(); confirm
 * there are no remaining callers before removing it.
 *
 * @param {Object<string, Array<Object>>} groupMap
 *   A map of groupKey → array of posts in that translation group.
 * @returns {Promise<void>}
 */
async function writeMarkdownFilesPromise_v1(groupMap) {
  const targets = Object.values(groupMap).flatMap((postsInGroup) => {
    // —— Polylang mode: multiple translations per group ——
    if (shared.config.polylang && postsInGroup.length > 1) {
      const baseSlug = chooseBaseSlug(postsInGroup, shared.config.defaultLanguage);
      const groupFolder = shared.config.postFolders
        ? path.join(shared.config.output, baseSlug)
        : shared.config.output;
      if (shared.config.postFolders) {
        fs.mkdirSync(groupFolder, { recursive: true });
      }

      return postsInGroup.map((post) => {
        const lang = post.polylang?.language || 'und';
        // e.g. "index.en.md" inside the group folder, or "my-post.en.md" flat
        const fileName = shared.config.postFolders
          ? `index.${lang}.md`
          : `${baseSlug}.${lang}.md`;
        return {
          post,
          name: fileName,
          destinationPath: path.join(groupFolder, fileName)
        };
      });
    }

    // —— Fallback: single-post groups (or polylang=false) ——
    return postsInGroup.map((post) => ({
      post,
      name: shared.getSlugWithFallback(post),
      destinationPath: shared.buildPostPath(post)
    }));
  });

  const { payloads, existingCount } = queueMarkdownTargets(targets);
  logSavingMessage('posts', existingCount, payloads.length);
  if (payloads.length > 0) {
    await processPayloadsPromise(payloads, loadMarkdownFilePromise);
  }
}

/**
 * Queues every post in `groupMap` for writing as a Markdown file and hands the
 * queue to processPayloadsPromise().
 *
 * Groups with more than one post are treated as Polylang translation groups
 * when `shared.config.polylang` is set (see resolveTranslationTargets()); all
 * other posts are written to shared.buildPostPath(post). Files that already
 * exist are counted and skipped.
 *
 * Every payload carries a `name`, matching the payload shape used by the other
 * writers in this file: the slug for ordinary posts, "<baseSlug>.<lang>" for
 * translations.
 *
 * @param {Object<string, Array<Object>>} groupMap
 *   A map where each key is a translation-group identifier (or a single-post ID)
 *   and each value is an array of post objects in that group.
 * @returns {Promise<void>}
 */
async function writeMarkdownFilesPromise(groupMap) {
  const targets = Object.values(groupMap).flatMap((postsInGroup) => {
    if (shared.config.polylang && postsInGroup.length > 1) {
      return resolveTranslationTargets(postsInGroup);
    }
    // Single-post group (no translations) or Polylang disabled
    return postsInGroup.map((post) => ({
      post,
      name: shared.getSlugWithFallback(post),
      destinationPath: shared.buildPostPath(post)
    }));
  });

  const { payloads, existingCount } = queueMarkdownTargets(targets);
  logSavingMessage('posts', existingCount, payloads.length);
  if (payloads.length > 0) {
    await processPayloadsPromise(payloads, loadMarkdownFilePromise);
  }
}
/**
 * Queues every image referenced by the posts in `groupMap` for download and
 * hands the queue to processPayloadsPromise().
 *
 * Images are saved in an "images" folder next to the Markdown they belong to:
 * - Polylang translation group (`shared.config.polylang` set and more than one
 *   post in the group): next to the path shared.buildPostPath() yields for the
 *   group's base slug, i.e. the same folder derivation writeMarkdownFilesPromise()
 *   uses for the group's Markdown files;
 * - any other post: next to shared.buildPostPath(post).
 *
 * Images already on disk (per checkFile()) are counted and skipped. A
 * destination that has already been queued during this call is skipped as
 * well, so translations that reference the same image do not trigger repeated
 * downloads to the same path. Each payload's `delay` grows by
 * `shared.config.requestDelay` per queued image.
 *
 * @param {Object<string, Array<Object>>} groupMap
 *   A map of groupKey → array of posts in that translation group.
 * @returns {Promise<void>}
 */
async function writeImageFilesPromise(groupMap) {
  let existingCount = 0;
  let delay = 0;
  const payloads = [];
  // checkFile() only sees the disk, so track what this run has already queued.
  const queuedPaths = new Set();

  for (const postsInGroup of Object.values(groupMap)) {
    let groupImagesDir = null;
    if (shared.config.polylang && postsInGroup.length > 1) {
      // Resolve the folder through buildPostPath() so images land beside the
      // group's Markdown whatever folder layout buildPostPath() applies.
      const baseSlug = chooseBaseSlug(postsInGroup, shared.config.defaultLanguage);
      const basePath = shared.buildPostPath({ ...postsInGroup[0], slug: baseSlug });
      groupImagesDir = path.join(path.dirname(basePath), 'images');
    }

    for (const post of postsInGroup) {
      const imagesDir =
        groupImagesDir ?? path.join(path.dirname(shared.buildPostPath(post)), 'images');

      for (const imageUrl of post.imageUrls) {
        const filename = shared.getFilenameFromUrl(imageUrl);
        const destinationPath = path.join(imagesDir, filename);

        if (queuedPaths.has(destinationPath)) {
          continue;
        }
        if (checkFile(destinationPath)) {
          // already exists, don't need to save again
          existingCount++;
          continue;
        }

        queuedPaths.add(destinationPath);
        payloads.push({
          item: imageUrl,
          type: 'image',
          name: filename,
          destinationPath,
          delay
        });
        delay += shared.config.requestDelay;
      }
    }
  }

  logSavingMessage('images', existingCount, payloads.length);
  if (payloads.length > 0) {
    await processPayloadsPromise(payloads, loadImageFilePromise);
  }
}