From 1c73f435d2fa3972efa61e18e662ec211dc715d6 Mon Sep 17 00:00:00 2001
From: Diego
Date: Tue, 12 Mar 2024 11:10:34 +0100
Subject: [PATCH] Serialize triple metadata from CSV coordinates into Zarr

Metadata is now read from a CSV file (resources/metadata_coordinates.csv)
instead of being derived from the RDF graph. The old MetadataLayout storage
layout is removed and replaced by a metadata::structure module with a
CoordinatesStructure implementation and a MetadataDimensionality type in
metadata::params. A CSVParser is added to the io module, a CSVParse variant
to RemoteHDTError, and the csv crate is added as a dependency.
Metadata::serialize now takes the CSV path, a chunking strategy and the
list of metadata field names instead of the RDF path and reference system.
---
 Cargo.toml                            |   5 +-
 examples/metadata_bench.rs            |  15 +-
 resources/metadata_coordinates.csv    | 501 ++++++++++++++++++++++++++
 src/error.rs                          |   2 +
 src/io/mod.rs                         |  23 ++
 src/metadata/mod.rs                   |  79 ++--
 src/metadata/params.rs                |  25 ++
 src/metadata/structure/coordinates.rs |  95 +++++
 src/metadata/structure/mod.rs         |  64 ++++
 src/storage/layout/metadata.rs        | 102 ------
 src/storage/layout/mod.rs             |   1 -
 11 files changed, 751 insertions(+), 161 deletions(-)
 create mode 100644 resources/metadata_coordinates.csv
 create mode 100644 src/metadata/params.rs
 delete mode 100644 src/storage/layout/metadata.rs

diff --git a/Cargo.toml b/Cargo.toml
index ac39d2e..8be70b1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-authors = [ "Ángel Iglesias Préstamo " ]
+authors = [ "Ángel Iglesias Préstamo ", "Diego Martín Fernández " ]
 name = "remote-hdt"
 version = "0.0.1"
 edition = "2021"
@@ -16,6 +16,7 @@ rio_xml = "0.8.4"
 rio_api = "0.8.4"
 rayon = "1.8.0"
 parking_lot = "0.12"
+csv = "1.3.0"
 
 [target.'cfg(not(target_env = "msvc"))'.dependencies]
 jemallocator = "0.5.0"
@@ -23,4 +24,4 @@
 [profile.release]
 codegen-units = 1
 opt-level = 3
-lto = "fat"
\ No newline at end of file
+lto = "fat"
diff --git a/examples/metadata_bench.rs b/examples/metadata_bench.rs
index df2efe8..2d9f723 100644
--- a/examples/metadata_bench.rs
+++ b/examples/metadata_bench.rs
@@ -3,25 +3,22 @@ use remote_hdt::metadata::Metadata;
 use remote_hdt::storage::params::Backend;
 use remote_hdt::storage::params::ReferenceSystem;
 use remote_hdt::storage::params::Serialization;
-use remote_hdt::storage::layout::metadata::MetadataLayout;
+use remote_hdt::metadata::structure::coordinates::CoordinatesStructure;
 use remote_hdt::storage::params::ChunkingStrategy;
 
 fn main() -> Result<(), RemoteHDTError> {
-    let rdf_path = "resources/1-lubm.ttl";
-    let metadata_path = "";
+    //let rdf_path = "resources/1-lubm.ttl";
+    let metadata_path = "resources/metadata_coordinates.csv";
     let zarr_path = "1-lubm-metadata.zarr";
-    let fields = vec!["X_pos", "Y_pos"];
-    let mut metadata = Metadata::new( MetadataLayout,Serialization::Zarr);
+    let fields = vec!["ID", "X_pos", "Y_pos"]; // TODO: this fails when the first string is bigger, e.g. vec!["triple_id", "X_pos", "Y_pos"]
+    let mut metadata = Metadata::new(CoordinatesStructure, Serialization::Zarr);
 
     metadata
         .serialize(
             Backend::FileSystem(zarr_path),
-            rdf_path,
-            ChunkingStrategy::Sharding(1024),
-            ReferenceSystem::SPO,
-            metadata_path,
+            metadata_path,
+            ChunkingStrategy::Sharding(250),
             fields)
         .unwrap();
diff --git a/resources/metadata_coordinates.csv b/resources/metadata_coordinates.csv
new file mode 100644
index 0000000..4511182
--- /dev/null
+++ b/resources/metadata_coordinates.csv
@@ -0,0 +1,501 @@
+triple_id,x_pos,y_pos
+1,10,20
+2,15,25
+3,30,40
+4,50,60
+5,22,35
+6,48,52
+7,16,28
+8,39,42
+9,12,18
+10,55,70
+11,21,38
+12,45,58
+13,33,47
+14,29,36
+15,62,75
+16,11,23
+17,28,41
+18,49,64
+19,18,30
+20,58,72
+21,26,37
+22,43,55
+23,31,46
+24,19,26
+25,60,68
+26,14,22
+27,37,49
+28,54,66
+29,24,32
+30,65,80
+31,35,43
+32,56,69
+33,40,50
+34,27,39
+35,70,85
+36,32,44
+37,52,63
+38,42,54
+39,38,48
+40,75,90
+41,34,45
+42,59,74
+43,47,57
+44,36,51
+45,80,95
+46,38,47
+47,61,76
+48,51,61
+49,44,59
+50,85,100
+51,41,53
+52,64,78
+53,54,65
+54,48,60
+55,90,105
+56,46,58
+57,68,82
+58,57,70
+59,50,63
+60,95,110
+61,53,67
+62,70,84
+63,59,71
+64,52,62 +65,100,115 +66,57,69 +67,72,88 +68,63,77 +69,56,66 +70,105,120 +71,61,73 +72,76,91 +73,65,80 +74,58,68 +75,110,125 +76,68,81 +77,78,93 +78,67,79 +79,64,74 +80,115,130 +81,71,85 +82,84,99 +83,70,83 +84,72,86 +85,120,135 +86,76,90 +87,90,105 +88,75,88 +89,78,94 +90,125,140 +91,81,97 +92,94,110 +93,80,95 +94,82,96 +95,130,145 +96,86,100 +97,100,115 +98,85,99 +99,88,103 +100,135,150 +101,91,106 +102,104,119 +103,87,102 +104,90,107 +105,140,155 +106,96,110 +107,110,125 +108,92,108 +109,95,112 +110,145,160 +111,101,117 +112,114,129 +113,98,113 +114,101,118 +115,150,165 +116,106,120 +117,120,135 +118,103,118 +119,106,122 +120,155,170 +121,111,127 +122,124,140 +123,107,123 +124,110,127 +125,160,175 +126,116,130 +127,130,145 +128,113,128 +129,116,132 +130,165,180 +131,121,137 +132,134,149 +133,119,135 +134,122,138 +135,170,185 +136,126,140 +137,140,155 +138,125,140 +139,128,144 +140,175,190 +141,131,147 +142,144,160 +143,127,142 +144,130,147 +145,180,195 +146,136,150 +147,150,165 +148,133,148 +149,136,152 +150,185,200 +151,141,157 +152,154,170 +153,129,145 +154,132,147 +155,190,205 +156,146,160 +157,160,175 +158,137,152 +159,140,155 +160,195,210 +161,151,167 +162,164,180 +163,139,155 +164,142,157 +165,200,215 +166,156,170 +167,170,185 +168,143,158 +169,146,163 +170,205,220 +171,161,177 +172,174,190 +173,149,164 +174,152,168 +175,210,225 +176,166,180 +177,180,195 +178,153,169 +179,156,172 +180,215,230 +181,171,187 +182,184,200 +183,159,174 +184,162,177 +185,220,235 +186,176,190 +187,190,205 +188,165,180 +189,168,184 +190,225,240 +191,181,197 +192,194,210 +193,169,185 +194,172,188 +195,230,245 +196,186,200 +197,200,215 +198,173,189 +199,176,192 +200,235,250 +201,191,207 +202,204,220 +203,179,195 +204,182,197 +205,240,255 +206,196,210 +207,210,225 +208,175,190 +209,178,194 +210,245,260 +211,201,217 +212,214,230 +213,189,204 +214,192,207 +215,250,265 +216,206,220 +217,220,235 +218,185,200 +219,188,203 +220,255,270 +221,211,227 +222,224,240 +223,199,215 +224,202,217 +225,260,275 +226,216,230 +227,230,245 +228,195,210 +229,198,214 +230,265,280 +231,221,237 +232,234,250 +233,209,224 +234,212,227 +235,270,285 +236,226,240 +237,240,255 +238,205,220 +239,208,223 +240,275,290 +241,231,247 +242,244,260 +243,219,235 +244,222,237 +245,280,295 +246,236,250 +247,250,265 +248,215,230 +249,218,233 +250,285,300 +251,241,257 +252,254,270 +253,229,245 +254,232,247 +255,290,305 +256,246,260 +257,260,275 +258,225,240 +259,228,243 +260,295,310 +261,251,267 +262,264,280 +263,239,255 +264,242,257 +265,300,315 +266,256,270 +267,270,285 +268,235,250 +269,238,253 +270,305,320 +271,261,277 +272,274,290 +273,249,265 +274,252,267 +275,310,325 +276,266,280 +277,280,295 +278,245,260 +279,248,263 +280,315,330 +281,271,287 +282,284,300 +283,259,275 +284,262,277 +285,320,335 +286,276,290 +287,290,305 +288,255,270 +289,258,273 +290,325,340 +291,281,297 +292,294,310 +293,269,285 +294,272,287 +295,330,345 +296,286,300 +297,300,315 +298,265,280 +299,268,283 +300,335,350 +301,291,307 +302,304,320 +303,279,295 +304,282,297 +305,340,355 +306,296,310 +307,310,325 +308,275,290 +309,278,293 +310,345,360 +311,301,317 +312,314,330 +313,289,305 +314,292,307 +315,350,365 +316,306,320 +317,320,335 +318,285,300 +319,288,303 +320,355,370 +321,311,327 +322,324,340 +323,299,315 +324,302,317 +325,360,375 +326,316,330 +327,330,345 +328,295,310 +329,298,313 +330,365,380 +331,321,337 +332,334,350 +333,309,325 +334,312,327 +335,370,385 +336,326,340 +337,340,355 +338,305,320 +339,308,323 +340,375,390 +341,331,347 +342,344,360 +343,319,335 
+344,322,337 +345,380,395 +346,336,350 +347,350,365 +348,315,330 +349,318,333 +350,385,400 +351,341,357 +352,354,370 +353,329,345 +354,332,347 +355,390,405 +356,346,360 +357,360,375 +358,325,340 +359,328,343 +360,395,410 +361,351,367 +362,364,380 +363,339,355 +364,342,357 +365,400,415 +366,356,370 +367,370,385 +368,335,350 +369,338,353 +370,405,420 +371,361,377 +372,374,390 +373,349,365 +374,352,367 +375,410,425 +376,366,380 +377,380,395 +378,345,360 +379,348,363 +380,415,430 +381,371,387 +382,384,400 +383,359,375 +384,362,377 +385,420,435 +386,376,390 +387,390,405 +388,355,370 +389,358,373 +390,425,440 +391,381,397 +392,394,410 +393,369,385 +394,372,387 +395,430,445 +396,386,400 +397,400,415 +398,365,380 +399,368,383 +400,435,450 +401,391,407 +402,404,420 +403,379,395 +404,382,397 +405,440,455 +406,396,410 +407,410,425 +408,375,390 +409,378,393 +410,445,460 +411,401,417 +412,414,430 +413,389,405 +414,392,407 +415,450,465 +416,406,420 +417,420,435 +418,385,400 +419,388,403 +420,455,470 +421,411,427 +422,424,440 +423,399,415 +424,402,417 +425,460,475 +426,416,430 +427,430,445 +428,395,410 +429,398,413 +430,465,480 +431,421,437 +432,434,450 +433,409,425 +434,412,427 +435,470,485 +436,426,440 +437,440,455 +438,405,420 +439,408,423 +440,475,490 +441,431,447 +442,444,460 +443,419,435 +444,422,437 +445,480,495 +446,436,450 +447,450,465 +448,415,430 +449,418,433 +450,485,500 +451,441,457 +452,454,470 +453,429,445 +454,432,447 +455,490,505 +456,446,460 +457,460,475 +458,425,440 +459,428,443 +460,495,510 +461,451,467 +462,464,480 +463,439,455 +464,442,457 +465,500,515 +466,456,470 +467,470,485 +468,435,450 +469,438,453 +470,505,520 +471,461,477 +472,474,490 +473,449,465 +474,452,467 +475,510,525 +476,466,480 +477,480,495 +478,445,460 +479,448,463 +480,515,530 +481,471,487 +482,484,500 +483,459,475 +484,462,477 +485,520,535 +486,476,490 +487,490,505 +488,455,470 +489,458,473 +490,525,540 +491,481,497 +492,494,510 +493,469,485 +494,472,487 +495,530,545 +496,486,500 +497,500,515 +498,465,480 +499,468,483 +500,535,550 \ No newline at end of file diff --git a/src/error.rs b/src/error.rs index 7868acb..96ac9bf 100644 --- a/src/error.rs +++ b/src/error.rs @@ -58,6 +58,8 @@ pub enum RemoteHDTError { ReadOnlyBackend, #[error("Error while parsing the RDF graph")] RdfParse, + #[error("Error while parsing the CSV metadata")] + CSVParse, #[error(transparent)] NonZero(#[from] NonZeroError), } diff --git a/src/io/mod.rs b/src/io/mod.rs index 7bf6a33..8904a83 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -3,6 +3,8 @@ use rio_api::parser::TriplesParser; use std::collections::HashSet; use std::fs::File; use std::io::BufReader; +use std::error::Error; +use csv::ReaderBuilder; use crate::dictionary::Dictionary; use crate::error::ParserError; @@ -130,6 +132,7 @@ trait Backend::Error>> { } pub struct RdfParser; +pub struct CSVParser; impl RdfParser { pub fn parse(path: &str, reference_system: &ReferenceSystem) -> RdfParserResult { @@ -142,3 +145,23 @@ impl RdfParser { } } } + +impl CSVParser { + pub fn parse(filename: &str) -> Result>, Box> { + let file = File::open(filename)?; + let mut reader = ReaderBuilder::new() + .has_headers(true) + .from_reader(file); + + let mut records: Vec> = Vec::new(); + + for result in reader.records() { + let record = result?; + let values: Vec = record.iter().map(|s| s.parse::().unwrap()).collect(); + + records.push(values); + } + + Ok(records) + } +} diff --git a/src/metadata/mod.rs b/src/metadata/mod.rs index 2ee7aeb..5e43fef 100644 --- a/src/metadata/mod.rs +++ 
b/src/metadata/mod.rs
@@ -5,10 +5,10 @@ use std::collections::HashSet;
 use std::sync::Arc;
 
 use serde_json::Map;
+use crate::io::CSVParser;
 use crate::storage::layout;
-use crate::storage::layout::metadata::MetadataLayout;
-use crate::storage::layout::tabular::TabularLayout;
 use crate::utils::rdf_to_value;
 use crate::dictionary::Dictionary;
@@ -22,6 +22,7 @@ use crate::storage::params::Dimensionality;
 use crate::storage::params::ReferenceSystem;
 use crate::storage::params::Serialization;
 use crate::storage::params::ChunkingStrategy;
+use crate::metadata::params::MetadataDimensionality;
 
 use fcsd::Set;
 use zarrs::array::Array;
@@ -31,43 +32,44 @@ use zarrs::opendal::services::Fs;
 use zarrs::opendal::services::Http;
 use zarrs::opendal::Operator;
 use zarrs::storage::store::OpendalStore;
+use self::structure::Structure;
 
 use super::utils::hash_to_set;
 
-pub type MetadataResult<T> = Result<T, RemoteHDTError>;
+
 pub mod structure;
+pub mod params;
+
+pub type MetadataResult<T> = Result<T, RemoteHDTError>;
 
 const ARRAY_NAME: &str = "/group/RemoteHDT"; // TODO: parameterize this
 
 pub struct Metadata<C> {
-    flatten_graph: Vec<(String)>,
     serialization: Serialization,
-    dictionary: Dictionary,
     array: Option<Array<OpendalStore>>,
-    dimensionality: Dimensionality,
-    layout: Box<dyn Layout<C>>,
+    dimensionality: MetadataDimensionality,
+    structure: Box<dyn Structure<C>>,
 }
 
 impl<C> Metadata<C> {
-    pub fn new( layout: impl Layout<C> + 'static, serialization: Serialization) -> Self {
+    pub fn new(structure: impl Structure<C> + 'static, serialization: Serialization) -> Self {
         Metadata {
-            flatten_graph: Vec::<String>::default(),
             serialization: serialization,
-            dictionary: Dictionary::default(),
             array: None,
             dimensionality: Default::default(),
-            layout: Box::new(layout),
+            structure: Box::new(structure),
         }
     }
 
     pub fn serialize<'a>(
         &mut self,
         store: Backend<'a>,
-        rdf_path: &str,
-        chunking_strategy: ChunkingStrategy,
-        reference_system: ReferenceSystem,
-        metadata_path: &str,
+        metadata_path: &str,
+        chunking_strategy: ChunkingStrategy,
         fields: Vec<&str>,
     ) -> MetadataResult<&mut Self> {
@@ -96,52 +98,35 @@ impl Metadata {
         // 2. We can create the FileSystemStore appropriately
         let store = Arc::new(OpendalStore::new(operator.blocking()));
 
-        let graph = match RdfParser::parse(rdf_path, &reference_system) {
-            Ok((graph, dictionary)) => {
-                self.dictionary = dictionary;
-                self.dimensionality = Dimensionality::new(&self.dictionary, &graph);
-                graph
+        let metadata = match CSVParser::parse(metadata_path) {
+            Ok(result) => {
+                self.dimensionality = MetadataDimensionality::new(result.len(), fields.len());
+                result
             }
-            Err(_) => return Err(RemoteHDTError::RdfParse),
+            Err(_) => return Err(RemoteHDTError::CSVParse),
         };
 
-        //Flatten the graph into triples
-        let mut count = 0;
-        for i in graph.iter() {
-            for j in i.iter() {
-                self.flatten_graph.push(format!["{};{};{}",count, j.0, j.1])
-            }
-            count += 1;
-        }
-
-        //TODO: change the implementation so it is only done here the flatten
-        let triples: HashSet<_> = self.flatten_graph.clone().into_iter().collect();
-        let subjects = self.dictionary.subjects();
-        let predicates = self.dictionary.predicates();
-        let objects = self.dictionary.objects();
-
         let arr = ArrayBuilder::new(
-            self.layout.shape(&self.dimensionality),
-            self.layout.data_type(),
-            self.layout
+            self.structure.shape(&self.dimensionality),
+            self.structure.data_type(),
+            self.structure
                 .chunk_shape(chunking_strategy, &self.dimensionality),
-            self.layout.fill_value(),
+            self.structure.fill_value(),
         )
-        .dimension_names(self.layout.dimension_names(&reference_system))
-        .array_to_bytes_codec(self.layout.array_to_bytes_codec(&self.dimensionality)?)
+        .dimension_names(self.structure.dimension_names())
+        .array_to_bytes_codec(self.structure.array_to_bytes_codec(&self.dimensionality)?)
         .attributes({
             let mut attributes = Map::new();
-            attributes.insert("triples".into(), rdf_to_value(Set::new(hash_to_set(triples)).unwrap()));
-            attributes.insert("subjects".into(), rdf_to_value(subjects));
-            attributes.insert("predicates".into(), rdf_to_value(predicates));
-            attributes.insert("objects".into(), rdf_to_value(objects));
-            attributes.insert("reference_system".into(), reference_system.as_ref().into());
+            // fcsd::Set expects its keys to be sorted; see the TODO on `fields`
+            // in examples/metadata_bench.rs.
+            attributes.insert("metadata_fields".into(), rdf_to_value(Set::new(fields).unwrap()));
             attributes
         })
         .build(store, ARRAY_NAME)?;
 
         arr.store_metadata()?;
-        self.layout.serialize(arr, graph)?;
+        self.structure.serialize(arr, metadata)?;
 
         Ok(self)
     }
diff --git a/src/metadata/params.rs b/src/metadata/params.rs
new file mode 100644
index 0000000..b4f8b46
--- /dev/null
+++ b/src/metadata/params.rs
@@ -0,0 +1,25 @@
+#[derive(Default)]
+pub struct MetadataDimensionality {
+    pub(crate) metadata_size: usize,
+    pub(crate) fields_size: usize,
+}
+
+impl MetadataDimensionality {
+    pub fn new(number_of_triples: usize, number_of_fields: usize) -> Self {
+        MetadataDimensionality {
+            metadata_size: number_of_triples,
+            fields_size: number_of_fields,
+        }
+    }
+
+    pub fn get_metadata_size(&self) -> u64 {
+        self.metadata_size as u64
+    }
+
+    pub fn get_fields_size(&self) -> u64 {
+        self.fields_size as u64
+    }
+}
diff --git a/src/metadata/structure/coordinates.rs b/src/metadata/structure/coordinates.rs
index e69de29..9324090 100644
--- a/src/metadata/structure/coordinates.rs
+++ b/src/metadata/structure/coordinates.rs
@@ -0,0 +1,95 @@
+use std::num::NonZeroU64;
+
+use parking_lot::Mutex;
+use sprs::TriMat;
+use zarrs::array::codec::array_to_bytes::sharding::ShardingCodecBuilder;
+use zarrs::array::codec::ArrayToBytesCodecTraits;
+use zarrs::array::codec::GzipCodec;
+use zarrs::array::ChunkGrid;
+use zarrs::array::DataType;
+use zarrs::array::DimensionName;
+use zarrs::array::FillValue;
+
+use super::ChunkingStrategy;
+use super::MetadataStructure;
+use super::MetadataResult;
+use super::Structure;
+use super::StructureOps;
+
+use crate::metadata::params::MetadataDimensionality;
+
+type Chunk = Vec<u32>;
+
+pub struct CoordinatesStructure;
+
+impl Structure<Chunk> for CoordinatesStructure {
+    fn shape(&self, dimensionality: &MetadataDimensionality) -> Vec<u64> {
+        vec![dimensionality.get_metadata_size(), dimensionality.get_fields_size()]
+    }
+
+    fn data_type(&self) -> DataType {
+        DataType::UInt32
+    }
+
+    fn chunk_shape(
+        &self,
+        chunking_strategy: ChunkingStrategy,
+        dimensionality: &MetadataDimensionality,
+    ) -> ChunkGrid {
+        // One chunk spans `chunking_strategy` rows and every metadata field.
+        vec![
+            chunking_strategy.into(),
+            NonZeroU64::new(dimensionality.get_fields_size()).unwrap(),
+        ]
+        .into()
+    }
+
+    fn fill_value(&self) -> FillValue {
+        FillValue::from(0u32)
+    }
+
+    fn dimension_names(&self) -> Option<Vec<DimensionName>> {
+        Some(vec![
+            DimensionName::new("Triples_metadata"),
+            DimensionName::new("Fields"),
+        ])
+    }
+
+    fn array_to_bytes_codec(
+        &self,
+        dimensionality: &MetadataDimensionality,
+    ) -> MetadataResult<Box<dyn ArrayToBytesCodecTraits>> {
+        let mut sharding_codec_builder =
+            ShardingCodecBuilder::new(vec![1, dimensionality.get_fields_size()].try_into()?);
+        sharding_codec_builder.bytes_to_bytes_codecs(vec![Box::new(GzipCodec::new(5)?)]);
+        Ok(Box::new(sharding_codec_builder.build()))
+    }
+}
+
+impl StructureOps<Chunk> for CoordinatesStructure {
+    fn store_chunk_elements(&self, chunk: &[Chunk], _: usize) -> Vec<u32> {
+        // Flatten the rows of a shard into the row-major element buffer zarrs expects.
+        let mut ans = Vec::new();
+        for i in chunk {
+            for &j in i {
+                ans.push(j);
+            }
+        }
+        ans
+    }
+
+    fn retrieve_chunk_elements(
+        &mut self,
+        matrix: &Mutex<TriMat<usize>>,
+        first_term_index: usize, // TODO: will first_term_index instead of chunk[0] do the trick?
+        chunk: &[usize],
+    ) {
+        matrix
+            .lock()
+            .add_triplet(chunk[0], chunk[2], chunk[1] as usize);
+    }
+
+    fn sharding_factor(&self, dimensionality: &MetadataDimensionality) -> usize {
+        dimensionality.metadata_size * dimensionality.fields_size
+    }
+
+    fn metadata_iter(&self, metadata_structure: MetadataStructure) -> Vec<Chunk> {
+        metadata_structure
+    }
+}
diff --git a/src/metadata/structure/mod.rs b/src/metadata/structure/mod.rs
index f5a66bc..58a0826 100644
--- a/src/metadata/structure/mod.rs
+++ b/src/metadata/structure/mod.rs
@@ -1,4 +1,5 @@
 use parking_lot::Mutex;
 use sprs::TriMat;
 use std::sync::atomic::AtomicU64;
 use std::sync::atomic::Ordering;
@@ -18,19 +19,82 @@ use crate::utils::columns_per_shard;
 use crate::utils::rows_per_shard;
 use crate::utils::value_to_term;
 
+use super::params::MetadataDimensionality;
 use super::ChunkingStrategy;
 use super::Dimensionality;
 use super::ReferenceSystem;
+use super::MetadataResult;
 
 type ArrayToBytesCodec = Box<dyn ArrayToBytesCodecTraits>;
 
 pub mod coordinates;
 
+type MetadataStructure = Vec<Vec<u32>>;
+
 pub trait StructureOps<C> {
+    fn retrieve_attributes(&mut self, arr: &Array<OpendalStore>) {
+        // TODO: retrieving the metadata attributes is not implemented yet.
+    }
+
+    fn serialize(&mut self, arr: Array<OpendalStore>, metadata: MetadataStructure) -> MetadataResult<()> {
+        let columns = arr.shape()[1] as usize;
+        let count = AtomicU64::new(0);
+        let binding = self.metadata_iter(metadata.to_owned());
+        let iter = binding.chunks_exact(rows_per_shard(&arr) as usize);
+        let remainder = iter.remainder();
+
+        // Store every full shard of rows...
+        for chunk in iter {
+            let slice = self.store_chunk_elements(chunk, columns);
+            arr.store_chunk_elements(&[count.load(Ordering::Relaxed), 0], slice)?;
+            count.fetch_add(1, Ordering::Relaxed);
+        }
+
+        // ...and then the trailing rows that do not fill a whole shard.
+        if !remainder.is_empty() {
+            arr.store_array_subset_elements(
+                &ArraySubset::new_with_start_shape(
+                    vec![count.load(Ordering::Relaxed) * rows_per_shard(&arr), 0],
+                    vec![remainder.len() as u64, columns_per_shard(&arr)],
+                )?,
+                self.store_chunk_elements(remainder, columns),
+            )?;
+        }
+
+        Ok(())
+    }
+
+    fn parse(
+        &mut self,
+        arr: &Array<OpendalStore>,
+        dimensionality: &MetadataDimensionality,
+    ) {
+        // TODO: parsing the metadata back from the Zarr array is not implemented yet.
+    }
+
+    fn metadata_iter(&self, graph: MetadataStructure) -> Vec<C>;
+    fn store_chunk_elements(&self, chunk: &[C], columns: usize) -> Vec<u32>;
+    fn retrieve_chunk_elements(
+        &mut self,
+        matrix: &Mutex<TriMat<usize>>,
+        first_term_idx: usize,
+        chunk: &[usize],
+    );
+    fn sharding_factor(&self, dimensionality: &MetadataDimensionality) -> usize;
 }
 
 pub trait Structure<C>: StructureOps<C> {
+    fn shape(&self, dimensionality: &MetadataDimensionality) -> Vec<u64>;
+    fn data_type(&self) -> DataType;
+    fn chunk_shape(
+        &self,
+        chunking_strategy: ChunkingStrategy,
+        dimensionality: &MetadataDimensionality,
+    ) -> ChunkGrid;
+    fn fill_value(&self) -> FillValue;
+    fn dimension_names(&self) -> Option<Vec<DimensionName>>;
+    fn array_to_bytes_codec(
+        &self,
+        dimensionality: &MetadataDimensionality,
+    ) -> MetadataResult<ArrayToBytesCodec>;
 }
diff --git a/src/storage/layout/metadata.rs b/src/storage/layout/metadata.rs
deleted file mode 100644
index ed2e509..0000000
--- a/src/storage/layout/metadata.rs
+++ /dev/null
@@ -1,102 +0,0 @@
-use std::num::NonZeroU64;
-
-use parking_lot::Mutex;
-use sprs::TriMat;
-use zarrs::array::codec::array_to_bytes::sharding::ShardingCodecBuilder;
-use zarrs::array::codec::ArrayToBytesCodecTraits;
-use zarrs::array::codec::GzipCodec;
-use zarrs::array::ChunkGrid;
-use zarrs::array::DataType;
-use zarrs::array::DimensionName;
-use zarrs::array::FillValue;
-
-use super::ChunkingStrategy;
-use super::Dimensionality;
-use super::ReferenceSystem;
-use super::StorageResult;
-
-use crate::io::Graph;
-use crate::storage::layout::LayoutOps;
-use crate::storage::Layout;
-
-type Chunk = (u32, u32, u32);
-
-pub struct MetadataLayout;
-
-impl Layout<Chunk> for MetadataLayout {
-    fn shape(&self, dimensionality: &Dimensionality) -> Vec<u64> {
-        vec![dimensionality.get_graph_size(), 3]
-    }
-
-    fn data_type(&self) -> DataType {
-        DataType::UInt32
-    }
-
-    fn chunk_shape(&self, chunking_strategy: ChunkingStrategy, _: &Dimensionality) -> ChunkGrid {
-        vec![chunking_strategy.into(), NonZeroU64::new(3).unwrap()].into() // TODO: make this a constant value
-    }
-
-    fn fill_value(&self) -> FillValue {
-        FillValue::from(0u32)
-    }
-
-    fn dimension_names(&self, _: &ReferenceSystem) -> Option<Vec<DimensionName>> {
-        Some(vec![
-            DimensionName::new("Triples"),
-            DimensionName::new("Fields"),
-        ])
-    }
-
-    fn array_to_bytes_codec(
-        &self,
-        _: &Dimensionality,
-    ) -> StorageResult<Box<dyn ArrayToBytesCodecTraits>> {
-        let mut sharding_codec_builder = ShardingCodecBuilder::new(vec![1, 3].try_into()?);
-        sharding_codec_builder.bytes_to_bytes_codecs(vec![Box::new(GzipCodec::new(5)?)]);
-        Ok(Box::new(sharding_codec_builder.build()))
-    }
-}
-
-impl LayoutOps<Chunk> for MetadataLayout {
-    fn graph_iter(&self, graph: Graph) -> Vec<Chunk> {
-        graph
-            .iter()
-            .enumerate()
-            .flat_map(|(first_term, triples)| {
-                let count = 0;
-                triples
-                    .iter()
-                    .map(|&(second_term, third_term)| {
-                        (count as u32, 1, 1)
-                    })
-                    .collect::<Vec<Chunk>>()
-            })
-            .collect::<Vec<Chunk>>()
-    }
-
-    fn store_chunk_elements(&self, chunk: &[Chunk], _: usize) -> Vec<u32> {
-        let mut ans = Vec::new();
-        for &(first_term, second_term, third_term) in chunk {
-            ans.push(first_term);
-            ans.push(second_term);
-            ans.push(third_term);
-        }
-        ans
-    }
-
-    fn retrieve_chunk_elements(
-        &mut self,
-        matrix: &Mutex<TriMat<usize>>,
-        first_term_index: usize, // TODO: will first_term_index instead of chunk[0] do the trick?
-        chunk: &[usize],
-    ) {
-        matrix
-            .lock()
-            .add_triplet(chunk[0], chunk[2], chunk[1] as usize);
-    }
-
-    fn sharding_factor(&self, dimensionality: &Dimensionality) -> usize {
-        dimensionality.first_term_size * dimensionality.third_term_size
-    }
-}
diff --git a/src/storage/layout/mod.rs b/src/storage/layout/mod.rs
index e1fd552..a50e2a8 100644
--- a/src/storage/layout/mod.rs
+++ b/src/storage/layout/mod.rs
@@ -28,7 +28,6 @@ type ArrayToBytesCodec = Box<dyn ArrayToBytesCodecTraits>;
 
 pub mod matrix;
 pub mod tabular;
-pub mod metadata;
 
 pub trait LayoutOps<C> {
     fn retrieve_attributes(&mut self, arr: &Array<OpendalStore>) -> StorageResult<Dictionary> {
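
examples/metadata_bench.rs exercises the new API end to end through Metadata::serialize. The snippet below is a minimal, self-contained sketch of the CSV-to-matrix step that CSVParser::parse performs, and of how MetadataDimensionality is derived from it, using the csv crate on the same resources/metadata_coordinates.csv. The helper name read_coordinates is illustrative, and parse errors are propagated with `?` instead of the unwrap used in the patch, so treat it as a sketch rather than the patch's exact implementation.

    use std::error::Error;

    use csv::ReaderBuilder;

    /// Reads a headed CSV of u32 columns into a row-major Vec<Vec<u32>>,
    /// mirroring the shape of what CSVParser::parse returns.
    fn read_coordinates(path: &str) -> Result<Vec<Vec<u32>>, Box<dyn Error>> {
        let mut reader = ReaderBuilder::new().has_headers(true).from_path(path)?;
        let mut rows = Vec::new();
        for record in reader.records() {
            let record = record?;
            // Every cell must parse as u32; a malformed cell becomes an Err instead of a panic.
            let row = record
                .iter()
                .map(|cell| cell.parse::<u32>())
                .collect::<Result<Vec<u32>, _>>()?;
            rows.push(row);
        }
        Ok(rows)
    }

    fn main() -> Result<(), Box<dyn Error>> {
        let rows = read_coordinates("resources/metadata_coordinates.csv")?;
        // With the 500-row sample file and the three-element `fields` vector,
        // MetadataDimensionality::new(rows.len(), fields.len()) describes a
        // 500 x 3 UInt32 array, stored 250 rows per shard by
        // ChunkingStrategy::Sharding(250).
        assert_eq!(rows.len(), 500);
        assert_eq!(rows[0], vec![1, 10, 20]);
        Ok(())
    }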