From 93b13c710a8e76ab151930e945c52351a69e8025 Mon Sep 17 00:00:00 2001
From: pmitev
Date: Mon, 13 Sep 2021 19:59:29 +0200
Subject: [PATCH] More fixes

---
 docs/Bio/NCBI-taxonomy.md             | 24 +++++++++++-------------
 docs/Case_studies/List.md             |  2 +-
 docs/Case_studies/multiple_files_I.md | 10 ++++------
 3 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/docs/Bio/NCBI-taxonomy.md b/docs/Bio/NCBI-taxonomy.md
index f963d416..b930e321 100644
--- a/docs/Bio/NCBI-taxonomy.md
+++ b/docs/Bio/NCBI-taxonomy.md
@@ -137,7 +137,7 @@ Might not be the best solution but it is easy to read and modify, for now. Note,
 $ ./01.tabulate-names.awk names.dmp | sort -g -k 1 > names.tab
 
 # Or with bzip2 compression "on the fly"
-$ ./01.tabulate-names.awk <(bzcat names.dmp.bz2) | bzip2 -c > names.tab.bz2
+$ ./01.tabulate-names.awk <(bzcat names.dmp.bz2) | sort -g -k 1 | bzip2 -c > names.tab.bz2
 ```
 
 ??? note "01.tabulate-names.awk"
@@ -150,9 +150,9 @@ $ ./01.tabulate-names.awk <(bzcat names.dmp.bz2) | bzip2 -c > names.tab.bz2
 
     $4 ~ "scientific name" { sciname[$1*1]= unds(Clean($2)); next}
 
-    $4 ~ "common name" { com_name[$1*1]= Cap(Clean($2)); next}
-
+    # Order is important: lines with "genbank common name" also match "common name", so the more specific pattern must be tested first.
     $4 ~ "genbank common name" { genbank[$1*1]= unds(Clean($2)); next}
+    $4 ~ "common name" { com_name[$1*1]= Cap(Clean($2)); next}
 
     END{
       for(i in sciname) print i"|"sciname[i]"|"com_name[i]"|"genbank[i]
@@ -171,9 +171,13 @@ $ ./01.tabulate-names.awk <(bzcat names.dmp.bz2) | bzip2 -c > names.tab.bz2
     function Cap (string) { return toupper(substr(string,0,1))substr(string,2) }
     ```
 
-Note that this script will keep the last information for the corresponding match for each ID. To prevent this we need to take care that any subsequent match is ignored
+Note that this script keeps the last value for every match of the same ID. It appears that the database contains repeated lines with incomplete information, which corrupts the tabulated data. To prevent this, we need to make sure that any subsequent match for an already recorded ID is ignored.
 
+``` bash
+$ ./01.tabulate-names-first.awk names.dmp | sort -g -k 1 > names-first.tab
+```
+
 ??? note "01.tabulate-names-first.awk"
     ``` awk
     #!/usr/bin/awk -f
 
@@ -184,9 +188,10 @@ Note that this script will keep the last information for the corresponding match
 
     $4 ~ "scientific name" { if (! sciname[$1*1] ) sciname[$1*1]= unds(Clean($2)); next}
 
+    # Order is important: lines with "genbank common name" also match "common name", so the more specific pattern must be tested first.
+    $4 ~ "genbank common name" { if (! genbank[$1*1] ) genbank[$1*1]= unds(Clean($2)); next}
     $4 ~ "common name" { if (! com_name[$1*1]) com_name[$1*1]= Cap(Clean($2)); next}
 
-    $4 ~ "genbank common name" { if (! genbank[$1*1] ) genbank[$1*1]= unds(Clean($2)); next}
 
     END{
       for(i in sciname) print i"|"sciname[i]"|"com_name[i]"|"genbank[i]
@@ -228,16 +233,9 @@ Now we can use the tabulated data in `names.tab` and perform the replacement in
 Again, this might not be the best way but it works. The suggested solutions could be easily merged into a single script. I would prefer to have them in steps, so I can make sure that the first step has completed successfully (*it takes some time*) before I continue. Also I can filter the unnecessary data in the newly tabulated file and use only relevant data or alter further if I need.
 ``` bash
-$ ./02.substitute.awk names.tab hg38.100way.scientificNames.nh > NEW.g38.100way.scientificNames.nh
-
-# Or with bzip2 compression "on the fly"
-$ ./02.substitute.awk <(bzcat names.tab.bz2) hg38.100way.scientificNames.nh > NEW.g38.100way.scientificNames.nh
+$ ./02.substitute.awk names-first.tab hg38.100way.scientificNames.nh > NEW.g38.100way.scientificNames.nh
 ```
-
-``` bash
-$ ./02.substitute.awk names.tab hg38.100way.scientificNames.nh
-```
 
 ??? note "02.substitute.awk"
     ``` awk linenums="1"
     #!/usr/bin/awk -f
diff --git a/docs/Case_studies/List.md b/docs/Case_studies/List.md
index 91732116..4c1e63c0 100644
--- a/docs/Case_studies/List.md
+++ b/docs/Case_studies/List.md
@@ -35,7 +35,7 @@ Here is a collection of mine and contributed awk scripts.
 * **[Gaussian smearing](Gaussian_smearing.md)**
   _trivial task done with awk - example how to use functions_
 * **[Linear interpolation](Linear_interpolation.md)**
-  _use linear interpolation to resample your date on different grid_
+  _use linear interpolation to resample your data on a different grid_
 
 ## Physics oriented
 * **[Dipole moment example](Dipole_moment.md)**
diff --git a/docs/Case_studies/multiple_files_I.md b/docs/Case_studies/multiple_files_I.md
index 45892215..15ee6506 100644
--- a/docs/Case_studies/multiple_files_I.md
+++ b/docs/Case_studies/multiple_files_I.md
@@ -35,13 +35,12 @@ Below, it is just one possible way to do it. First we need to have a list of all
 
 ``` awk
 #!/usr/bin/awk -f
-{
-  names[$1]= 1;
+{
   data[$1][ARGIND]= $2
 }
 
 END {
-  for (i in names) print i"\t\t"data[i][1]"\t\t"data[i][2]
+  for (i in data) print i"\t"data[i][1]"\t"data[i][2]
 }
 ```
 
@@ -138,12 +137,11 @@ Leave the extra blanks for the first attempt. We will use this problem (cleaning
 BEGIN{ FS="|" }
 
 {
-  id[$1]= 1;
   data[$1][FILENAME]= $2
 }
 
 END {
-  for (i in id) print trim(i)"|"trim(data[i]["scientific"])"|"trim(data[i]["genbank"])
+  for (i in data) print trim(i)"|"trim(data[i]["scientific"])"|"trim(data[i]["genbank"])
 }
 
 function trim (x) {
@@ -153,7 +151,7 @@ Leave the extra blanks for the first attempt. We will use this problem (cleaning
 }
 ```
 
-??? "Solution usung join uggested by Amrei Binzer-Panchal, 2021.01.18"
+??? "Solution using join suggested by Amrei Binzer-Panchal, 2021.01.18"
 
     ``` bash
     $ join -a1 -a2 -j 1 -o 0,1.2,2.2 -e "NULL" -t "|" <(sort scientific) <(sort genbank)
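
The key change in `01.tabulate-names-first.awk` above is the `if (! array[key])` guard, which switches awk from "last match wins" to "first match wins" semantics. A minimal standalone sketch of the difference, on invented two-column input (the script name and the sample values are hypothetical):

``` awk
#!/usr/bin/awk -f
# first-vs-last.awk -- feed it "key value" lines with repeated keys.
{
  last[$1]= $2                      # unguarded: every duplicate overwrites, last match wins
  if (! first[$1]) first[$1]= $2    # guarded: set only once, later duplicates are ignored
}

END{
  for (i in first) print i" first: "first[i]"  last: "last[i]
}
```

``` bash
$ printf "9606 Homo_sapiens\n9606 incomplete\n" | ./first-vs-last.awk
9606 first: Homo_sapiens  last: incomplete
```

The same caveat applies as in the patched script: the guard treats an empty value (or a plain 0) as "not set yet", so such records would not block later duplicates.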
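
The `join` one-liner in the last hunk merges the two pipe-delimited files on their first field. A minimal sketch of its behaviour on invented two-record inputs (the file names `scientific` and `genbank` come from the patch; the IDs and names below are hypothetical):

``` bash
# Hypothetical pipe-delimited inputs, "taxid|name":
$ cat scientific
10090|Mus_musculus
9606|Homo_sapiens
$ cat genbank
9606|human

# -t "|"        fields are pipe-separated
# -j 1          join on field 1
# -a1 -a2       also print lines found in only one of the files
# -e "NULL"     fill the field missing from such lines with NULL
# -o 0,1.2,2.2  output the join field, then field 2 of each file
$ join -a1 -a2 -j 1 -o 0,1.2,2.2 -e "NULL" -t "|" <(sort scientific) <(sort genbank)
10090|Mus_musculus|NULL
9606|Homo_sapiens|human
```

`join` requires its inputs to be sorted on the join field, which is why both files are passed through `sort` first.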