Skip to content

Commit

Permalink
add changes from NVIDIA#389
Browse files Browse the repository at this point in the history
Signed-off-by: Sarah Yurick <[email protected]>
  • Loading branch information
sarahyurick committed Dec 3, 2024
1 parent edd6262 commit 6846945
Show file tree
Hide file tree
Showing 22 changed files with 2,708 additions and 2,513 deletions.
60 changes: 38 additions & 22 deletions nemo_curator/modules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,54 +25,70 @@
from .config import FuzzyDuplicatesConfig, SemDedupConfig
from .dataset_ops import blend_datasets, Shuffle
from .exact_dedup import ExactDuplicates
from .filter import Filter, Score, ScoreFilter, ParallelScoreFilter
from .filter import Filter, Score, ScoreFilter
from .meta import Sequential
from .modify import Modify
from .task import TaskDecontamination

# GPU packages
LSH = gpu_only_import_from("nemo_curator.modules.fuzzy_dedup", "LSH")
MinHash = gpu_only_import_from("nemo_curator.modules.fuzzy_dedup", "MinHash")
FuzzyDuplicates = gpu_only_import_from(
"nemo_curator.modules.fuzzy_dedup", "FuzzyDuplicates"
MinHash = gpu_only_import_from("nemo_curator.modules.fuzzy_dedup.minhash", "MinHash")
LSH = gpu_only_import_from("nemo_curator.modules.fuzzy_dedup.lsh", "LSH")
_MapBuckets = gpu_only_import_from(
"nemo_curator.modules.fuzzy_dedup._mapbuckets", "_MapBuckets"
)
_Shuffle = gpu_only_import_from("nemo_curator.modules.fuzzy_dedup._shuffle", "_Shuffle")
JaccardSimilarity = gpu_only_import_from(
"nemo_curator.modules.fuzzy_dedup.jaccardsimilarity", "JaccardSimilarity"
)
BucketsToEdges = gpu_only_import_from(
"nemo_curator.modules.fuzzy_dedup", "BucketsToEdges"
"nemo_curator.modules.fuzzy_dedup.bucketstoedges", "BucketsToEdges"
)
ConnectedComponents = gpu_only_import_from(
"nemo_curator.modules.fuzzy_dedup.connectedcomponents", "ConnectedComponents"
)
FuzzyDuplicates = gpu_only_import_from(
"nemo_curator.modules.fuzzy_dedup.fuzzyduplicates", "FuzzyDuplicates"
)
# PyTorch-related imports must come after all imports that require cuGraph,
# because of context cleanup issues between PyTorch and cuGraph.
# See this issue: https://github.com/rapidsai/cugraph/issues/2718
SemDedup = gpu_only_import_from("nemo_curator.modules.semantic_dedup", "SemDedup")
EmbeddingCreator = gpu_only_import_from(
"nemo_curator.modules.semantic_dedup", "EmbeddingCreator"
"nemo_curator.modules.semantic_dedup.embeddings", "EmbeddingCreator"
)
ClusteringModel = gpu_only_import_from(
"nemo_curator.modules.semantic_dedup", "ClusteringModel"
"nemo_curator.modules.semantic_dedup.clusteringmodel", "ClusteringModel"
)
SemanticClusterLevelDedup = gpu_only_import_from(
"nemo_curator.modules.semantic_dedup", "SemanticClusterLevelDedup"
"nemo_curator.modules.semantic_dedup.semanticclusterleveldedup",
"SemanticClusterLevelDedup",
)
SemDedup = gpu_only_import_from(
"nemo_curator.modules.semantic_dedup.semdedup", "SemDedup"
)

# Names exported by a star-import of this package.
# NOTE(review): this listing looks like a diff hunk rendered without +/-
# markers, so removed and added entries appear interleaved and several names
# occur more than once (e.g. "AddId", "FuzzyDuplicatesConfig", "Modify",
# "MinHash", "LSH", "SemDedup"). Confirm against the real file before
# deduplicating or reordering anything here.
__all__ = [
"AddId",
"FuzzyDuplicatesConfig",
"SemDedupConfig",
"blend_datasets",
"Shuffle",
"ExactDuplicates",
"Filter",
"FuzzyDuplicatesConfig",
"FuzzyDuplicates",
"BucketsToEdges",
"LSH",
"MinHash",
"Modify",
"Score",
"ScoreFilter",
# NOTE(review): one of the two `.filter` import lines in this file omits
# this name — confirm it is still imported before exporting it.
"ParallelScoreFilter",
"Sequential",
"Modify",
"TaskDecontamination",
"AddId",
"blend_datasets",
"Shuffle",
"SemDedup",
"SemDedupConfig",
"MinHash",
"LSH",
# Underscore-prefixed names are listed explicitly here, so a star-import
# exports them as well.
"_MapBuckets",
"_Shuffle",
"JaccardSimilarity",
"BucketsToEdges",
"ConnectedComponents",
"FuzzyDuplicates",
"EmbeddingCreator",
"ClusteringModel",
"SemanticClusterLevelDedup",
"SemDedup",
]
Loading

0 comments on commit 6846945

Please sign in to comment.