Skip to content

Commit

Permalink
chore: collect no subpages resources
Browse files Browse the repository at this point in the history
  • Loading branch information
appflowy committed Nov 2, 2024
1 parent 4175465 commit 64e8a7d
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 40 deletions.
26 changes: 25 additions & 1 deletion collab-importer/src/notion/importer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,14 +90,20 @@ impl NotionImporter {
.await
.unwrap_or_default();

let no_subpages = !has_subdirectories(&self.path, 1);
let notion_export = NotionExportContext {
csv_relation,
no_subpages,
};

let path = self.path.clone();
let host = self.host.clone();
let workspace_id = self.workspace_id.clone();
let pages = tokio::task::spawn_blocking(move || {
// Process entries and track whether we have spaces (directories) and pages (non-directories)
let mut notion_pages: Vec<NotionPage> = vec![];
for entry in walk_sub_dir(&path) {
if let Some(view) = process_entry(&host, &workspace_id, &entry, false, &csv_relation) {
if let Some(view) = process_entry(&host, &workspace_id, &entry, false, &notion_export) {
has_spaces |= view.is_dir;
has_pages |= !view.is_dir;
notion_pages.push(view);
Expand Down Expand Up @@ -309,6 +315,15 @@ async fn convert_notion_page_to_parent_child(
view_builder.build()
}

/// Shared context threaded through the Notion-export directory walk,
/// bundling the state that `process_entry` and its helpers need.
pub struct NotionExportContext {
  /// Parent-child relationships between exported CSV files; see [CSVRelation].
  pub csv_relation: CSVRelation,
  /// `true` when the import root contains no subdirectories (a flat export).
  /// In that case resources (images/files) sit next to the markdown files and
  /// are collected from the file's parent directory instead of subpage dirs.
  pub no_subpages: bool,
}

/// [CSVRelation] manages parent-child relationships between CSV files exported in zip format from Notion.
/// The zip export may contain multiple CSV files that represent views of the main *_all.csv file.
/// When a partial CSV file is encountered, it is replaced with the main *_all.csv file and directed to
/// reference the *_all.csv file using the specified ID.
#[derive(Default, Debug, Clone)]
pub struct CSVRelation {
inner: Arc<HashMap<String, PathBuf>>,
Expand Down Expand Up @@ -481,6 +496,15 @@ fn extract_file_name(input: &str) -> String {

normalized
}

/// Returns `true` if `path` contains at least one subdirectory within
/// `max_depth` levels below it (`path` itself never counts).
///
/// Because directories can only be nested inside other directories, a
/// subdirectory at any depth > 1 implies one at depth 1, so inspecting the
/// immediate children is sufficient for every `max_depth >= 1`.
/// Symlinks are not followed, and I/O errors (e.g. an unreadable `path`)
/// are treated as "no subdirectories", matching the previous
/// error-swallowing walk.
fn has_subdirectories(path: &std::path::Path, max_depth: usize) -> bool {
  // Depth 0 would only visit `path` itself, which is excluded by definition.
  if max_depth == 0 {
    return false;
  }
  std::fs::read_dir(path)
    .map(|entries| {
      entries
        .filter_map(Result::ok)
        .any(|entry| entry.file_type().map(|ft| ft.is_dir()).unwrap_or(false))
    })
    .unwrap_or(false)
}

#[cfg(test)]
mod test_csv_relation {
use super::*;
Expand Down
87 changes: 48 additions & 39 deletions collab-importer/src/notion/walk_dir.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use percent_encoding::percent_decode_str;

use crate::notion::file::{process_row_md_content, NotionFile, Resource};
use crate::notion::page::{ExternalLink, ExternalLinkType, ImportedRowDocument, NotionPage};
use crate::notion::CSVRelation;
use crate::notion::NotionExportContext;
use crate::util::parse_csv;

use std::fs;
Expand All @@ -22,7 +22,6 @@ pub(crate) fn get_file_size(path: &PathBuf) -> std::io::Result<u64> {
}

pub(crate) fn collect_entry_resources(
_workspace_id: &str,
walk_path: &Path,
relative_path: Option<&Path>,
) -> Vec<Resource> {
Expand Down Expand Up @@ -80,7 +79,7 @@ pub(crate) fn process_entry(
workspace_id: &str,
current_entry: &DirEntry,
include_partial_csv: bool,
csv_relation: &CSVRelation,
notion_export: &NotionExportContext,
) -> Option<NotionPage> {
// Skip macOS-specific files
let entry_name = current_entry.file_name().to_str()?;
Expand All @@ -92,7 +91,7 @@ pub(crate) fn process_entry(
let ext = get_file_extension(path, include_partial_csv);
if ext.is_file() {
// Check if there's a corresponding directory for this .md file and skip it if so
process_file(host, workspace_id, path, ext, csv_relation)
process_file(host, workspace_id, path, ext, notion_export)
} else if path.is_dir() {
// If the path is a directory, it should contain a file with the same name but with either a .md or .csv extension.
// If no such file is found, the directory will be treated as a space.
Expand All @@ -113,7 +112,7 @@ pub(crate) fn process_entry(
id,
&md_file_path,
include_partial_csv,
csv_relation,
notion_export,
)
} else if all_csv_file_path.exists() {
process_csv_dir(
Expand All @@ -125,10 +124,10 @@ pub(crate) fn process_entry(
parent_path,
&all_csv_file_path,
&csv_file_path,
csv_relation,
notion_export,
)
} else {
process_space_dir(host, workspace_id, name, id, path, csv_relation)
process_space_dir(host, workspace_id, name, id, path, notion_export)
}
} else {
None
Expand All @@ -141,13 +140,13 @@ fn process_space_dir(
name: String,
id: Option<String>,
path: &Path,
csv_relation: &CSVRelation,
notion_export: &NotionExportContext,
) -> Option<NotionPage> {
let mut children = vec![];
// Collect all child entries first, to sort by created time
let entries: Vec<_> = walk_sub_dir(path);
for sub_entry in entries {
if let Some(child_view) = process_entry(host, workspace_id, &sub_entry, false, csv_relation) {
if let Some(child_view) = process_entry(host, workspace_id, &sub_entry, false, notion_export) {
children.push(child_view);
}
}
Expand All @@ -162,7 +161,7 @@ fn process_space_dir(
host: host.to_string(),
workspace_id: workspace_id.to_string(),
is_dir: true,
csv_relation: csv_relation.clone(),
csv_relation: notion_export.csv_relation.clone(),
})
}

Expand All @@ -176,7 +175,7 @@ fn process_csv_dir(
parent_path: &Path,
all_csv_file_path: &PathBuf,
csv_file_path: &PathBuf,
csv_relation: &CSVRelation,
notion_export: &NotionExportContext,
) -> Option<NotionPage> {
let mut resources = vec![];
let file_size = get_file_size(all_csv_file_path).ok()?;
Expand All @@ -185,7 +184,7 @@ fn process_csv_dir(
// To identify which CSV file contains these resources, we must check each row
// to see if any paths match the resource path.
// Currently, we do this in [filter_out_resources].
resources.extend(collect_entry_resources(workspace_id, parent_path, None));
resources.extend(collect_entry_resources(parent_path, None));
let mut row_documents = vec![];

// collect all sub entries whose entries are directory
Expand All @@ -194,7 +193,7 @@ fn process_csv_dir(
let csv_dir = parent_path.join(file_name);
if csv_dir.exists() {
for sub_entry in walk_sub_dir(&csv_dir) {
if let Some(mut page) = process_entry(host, workspace_id, &sub_entry, true, csv_relation) {
if let Some(mut page) = process_entry(host, workspace_id, &sub_entry, true, notion_export) {
if page.children.iter().any(|c| c.notion_file.is_markdown()) {
warn!("Only CSV file exist in the database row directory");
}
Expand All @@ -216,7 +215,10 @@ fn process_csv_dir(
page.children.retain(|child| {
if let Some(file_path) = child.notion_file.file_path() {
if let Ok(file_name) = file_name_from_path(file_path) {
return csv_relation.get(&file_name.to_lowercase()).is_none();
return notion_export
.csv_relation
.get(&file_name.to_lowercase())
.is_none();
}
}
true
Expand Down Expand Up @@ -260,10 +262,12 @@ fn process_csv_dir(
host: host.to_string(),
workspace_id: workspace_id.to_string(),
is_dir: false,
csv_relation: csv_relation.clone(),
csv_relation: notion_export.csv_relation.clone(),
};

csv_relation.set_page_by_path_buf(all_csv_file_path.clone(), page.clone());
notion_export
.csv_relation
.set_page_by_path_buf(all_csv_file_path.clone(), page.clone());
Some(page)
}

Expand All @@ -286,7 +290,7 @@ fn process_md_dir(
id: Option<String>,
md_file_path: &PathBuf,
include_partial_csv: bool,
csv_relation: &CSVRelation,
notion_export: &NotionExportContext,
) -> Option<NotionPage> {
let mut children = vec![];
let external_links = get_md_links(md_file_path).unwrap_or_default();
Expand All @@ -300,18 +304,14 @@ fn process_md_dir(
workspace_id,
&sub_entry,
include_partial_csv,
csv_relation,
notion_export,
) {
children.push(child_view);
}

// When traversing the directory, resources like images and files
// can be found within subdirectories of the current directory.
resources.extend(collect_entry_resources(
workspace_id,
sub_entry.path(),
None,
));
resources.extend(collect_entry_resources(sub_entry.path(), None));
}
}

Expand All @@ -331,7 +331,7 @@ fn process_md_dir(
host: host.to_string(),
workspace_id: workspace_id.to_string(),
is_dir: false,
csv_relation: csv_relation.clone(),
csv_relation: notion_export.csv_relation.clone(),
})
}

Expand All @@ -340,14 +340,14 @@ fn process_file(
workspace_id: &str,
path: &Path,
ext: FileExtension,
csv_relation: &CSVRelation,
notion_export: &NotionExportContext,
) -> Option<NotionPage> {
match ext {
FileExtension::Unknown => None,
FileExtension::Markdown => process_md_file(host, workspace_id, path, csv_relation),
FileExtension::Markdown => process_md_file(host, workspace_id, path, notion_export),
FileExtension::Csv {
include_partial_csv,
} => process_csv_file(host, workspace_id, path, include_partial_csv, csv_relation),
} => process_csv_file(host, workspace_id, path, include_partial_csv, notion_export),
}
}

Expand All @@ -356,7 +356,7 @@ fn process_csv_file(
workspace_id: &str,
path: &Path,
include_partial_csv: bool,
csv_relation: &CSVRelation,
notion_export: &NotionExportContext,
) -> Option<NotionPage> {
let file_name = path.file_name()?.to_str()?;
// Check if a folder exists with the same name as the CSV file, excluding the "_all.csv" suffix.
Expand Down Expand Up @@ -384,7 +384,7 @@ fn process_csv_file(
// to see if any paths match the resource path.
// Currently, we do this in [filter_out_resources].
if let Some(parent) = path.parent() {
resources.extend(collect_entry_resources(workspace_id, parent, None));
resources.extend(collect_entry_resources(parent, None));
}

let file_path = path.to_path_buf();
Expand All @@ -407,15 +407,15 @@ fn process_csv_file(
host: host.to_string(),
workspace_id: workspace_id.to_string(),
is_dir: false,
csv_relation: csv_relation.clone(),
csv_relation: notion_export.csv_relation.clone(),
})
}

fn process_md_file(
host: &str,
workspace_id: &str,
path: &Path,
csv_relation: &CSVRelation,
notion_export: &NotionExportContext,
) -> Option<NotionPage> {
if let Some(parent) = path.parent() {
let file_stem = path.file_stem()?.to_str()?;
Expand All @@ -427,7 +427,7 @@ fn process_md_file(

// Process the file normally if it doesn't correspond to a directory
let (name, id) = name_and_id_from_path(path).ok()?;
let notion_file = file_type_from_path(path)?;
let notion_file = notino_file_from_path(path, notion_export.no_subpages)?;
let mut external_links = vec![];
if notion_file.is_markdown() {
external_links = get_md_links(path).unwrap_or_default();
Expand All @@ -447,7 +447,7 @@ fn process_md_file(
host: host.to_string(),
workspace_id: workspace_id.to_string(),
is_dir: false,
csv_relation: csv_relation.clone(),
csv_relation: notion_export.csv_relation.clone(),
})
}

Expand Down Expand Up @@ -590,16 +590,25 @@ fn name_and_id_from_path(path: &Path) -> Result<(String, Option<String>), Import
/// - If the file is a `.csv` and contains `_all`, it's considered a `CSV`.
/// - Otherwise, if it's a `.csv`, it's considered a `CSVPart`.
/// - `.md` files are classified as `Markdown`.
fn file_type_from_path(path: &Path) -> Option<NotionFile> {
fn notino_file_from_path(path: &Path, no_subpages: bool) -> Option<NotionFile> {
let extension = path.extension()?.to_str()?;
let file_size = get_file_size(&path.to_path_buf()).ok()?;

match extension {
"md" => Some(NotionFile::Markdown {
file_path: path.to_path_buf(),
size: file_size,
resources: vec![],
}),
"md" => {
let mut resources = vec![];
if no_subpages {
if let Some(parent_path) = path.parent() {
resources = collect_entry_resources(parent_path, None);
}
}

Some(NotionFile::Markdown {
file_path: path.to_path_buf(),
size: file_size,
resources,
})
},
"csv" => {
let file_name = path.file_name()?.to_str()?;
if file_name.contains("_all") {
Expand Down
14 changes: 14 additions & 0 deletions collab-importer/tests/notion_test/import_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,20 @@ async fn import_blog_post_document_test() {
assert_blog_post(host, &info.workspace_id, root_view).await;
}

#[tokio::test]
async fn import_blog_post_no_subpages_test() {
  setup_log();
  let host = "http://test.appflowy.cloud";
  let workspace_id = uuid::Uuid::new_v4();
  let (_cleaner, file_path) = sync_unzip_asset("blog_post_no_subpages").await.unwrap();

  // Run the importer end-to-end against the flat (no-subpages) export fixture.
  let info = NotionImporter::new(1, &file_path, workspace_id, host.to_string())
    .unwrap()
    .import()
    .await
    .unwrap();
  assert_eq!(info.name, "blog_post_no_subpages");

  // The resulting root view must match the regular blog-post expectations.
  assert_blog_post(host, &info.workspace_id, &info.views()[0]).await;
}

#[tokio::test]
async fn import_project_test() {
let workspace_id = uuid::Uuid::new_v4();
Expand Down

0 comments on commit 64e8a7d

Please sign in to comment.