-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcommon_functions.pl
1637 lines (1489 loc) · 69.9 KB
/
common_functions.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#Below are the subroutines used in this code to save space and to allow other programs to use these subroutines
#Thes subroutines are used frequenty throughout the script above. Unfortunately in Perl there is no way to define the
#number and type of inputs to a function, so it is not "fool-proof", so before using please read the comments and or
#documentation.
#This subroutine pulls the text from between two delimiters, which is a very common operation in this code
#inputs:
# $_[0] = string from which to pull the text
# $_[1] = string which defines the beginning index (this string will not be pulled, will be skipped) beginning delimiter
# $_[2] = string which defines the end index, as previous, will not be returned in pulled string
#outputs:
# $soughtString = the string contained in $_[0] which lies between the delimiters $_[1] and $_[2]
sub getDelimitedText {
$beginIndex = index($_[0], $_[1]) + length($_[1]); #gives index of beginning of string then adds # characters so not capturing delimiter
$endIndex = index($_[0], $_[2], $beginIndex); #gives index of end delimiter
$stringLength = $endIndex - $beginIndex;
$soughtString = substr($_[0], $beginIndex, $stringLength); #pulls substring
return $soughtString; #returns substring
}
#This subroutine takes in a string that is a chemical formula and breaks it down into individual atomic symbols
#and subscripts, where the subscripts and atomic symbols are in the same order. That is for the formula C6H12O6,
#$AtomicSym[1] = H, $Subscript[1]=12.
#inputs:
# $_[0] = string that is the chemical formula, e.g. [C6H12O6]
#outputs:
# \@AtomicSym = reference to store of atomic symbols found, e.g. [C, H, O]
# \@Subscript = reference of store of substripts for found atomic symbols, if no subscript provided the default is 1, e.g. [6, 12, 6]
sub breakdownCF {
#define arrays to work with
my @AtomAndSub = ( );
my @AtomicSym = ( );
my @Subscript = ( );
$CF = $_[0];
while($CF =~ /[A-z][a-z]?[0-9]*/g) { #match pattern to single atomic symbol, while optionally capturing subscript
#for C6H12O6, the first interation should capture $& = "C6"
push(@AtomAndSub, $&); #add the match to the atom and sub array
}
#now that the atomic symbol/subscript pairs have been split from the rest, split the symbol from the subscript
for (my $i = 0; $i <= $#AtomAndSub; $i++) { #for each item in @AtomAndSub
if ($AtomAndSub[$i] =~ /n/g) {
next;
}
while ($AtomAndSub[$i] =~ /[A-z][a-z]?/g) { #for each atomic symbol
#add each symbol to the @AtomicSym array
push @AtomicSym, $&;
}
$tempSub = 1; #set default subscript to 1
#if there is a subscript other than one
while ($AtomAndSub[$i] =~ /[0-9]+/g) {
#then use that as the subscript value
$tempSub = $&;
}
push @Subscript, $tempSub; #add substript to @Subscript
}
#now the atomic symbols and subscripts are broken up, return them as arrays
return (\@AtomicSym, \@Subscript); #return the two arrays
#this is actually a little complex in that it is not possible to return more than one array, so we are simply returning
#an array of array references.
}
#This subroutine takes the atomic symbols and subscripts from breakdownCF and does two important things: 1) it adds new atomic symbols
#to the header for the metabolites document @MetLabels, and to the hash %AtomicSym and 2) formats the row of subscripts for the present species
#inputs:
# $_[0] = \@MetLabels = reference to array of column labels for metabolite document. Will push new atomic symbols into it
# $_[1] = \%AtomicSymbols = reference to hash of atomic symbols, key is symbol, value is number order. Will add new atomic symbols
# $_[2] = \@AtomicSyms = array of atomic symbols present in the current species
# $_[3] = \@Subscripts = array of subscripts present in current species related to the atomic symbols
#outputs:
# \@newMetLabels = reference to updated @MetLabels array
# \%newAtomicSymbols = reference to updated %AtomicSymbols hash
# $formattedSubs = string of subscripts formatted so that it is ready to append to the current metabolite row being generated
sub updateSymsAndSubs {
$MetLabelsRef = $_[0]; #get the met labels reference
$AtomicSymbolsRef = $_[1]; #get the atomic symbols reference
$AtomicSymsRef = $_[2]; #get atomic syms for this species reference
$SubscriptsRef = $_[3]; #get subscripts for this species reference
my @newMetLabels = @$MetLabelsRef; #get array from reference
my %newAtomicSymbols = %$AtomicSymbolsRef; #get symbols has from reference
my @copyAtomicSyms = @$AtomicSymsRef;
my @copySubscripts = @$SubscriptsRef;
#creates an array in whic hto store what atoms are currently present
my @PresentAtoms = ( );
#output string defined here
$formattedSubs = "";
for (my $i = 0; $i <= $#copyAtomicSyms; $i++) { #foreach atomic symbol in the species
$TempAtomicSym = $copyAtomicSyms[$i];
#1) check to see if it exists the the atomic symbol hash, if not add it to the hash, and the the labels
if (! exists $newAtomicSymbols{$TempAtomicSym}) { #if the atomic symbol does not exist in the hash
$numberofKeys = keys %newAtomicSymbols;
$newAtomicSymbols{$TempAtomicSym} = $numberofKeys; #add new key to hash at the current number of keys
#indexing begins at 0 as usual
push @newMetLabels, $copyAtomicSyms[$i]; #add new atomic symbol to end of metabolic labels
}
#add the subscript to the present symbols array at the index specified by the atomic symbol hash key
#any species not present in the species will be undef, or in otherwords will not exist
$PresentAtoms[$newAtomicSymbols{$copyAtomicSyms[$i]}] = $copySubscripts[$i];
}
#2) format the row for the metabolites document
for (my $i = 0; $i <= $#PresentAtoms; $i++) {
#if the element in the present atoms array exists/is defined
if (exists $PresentAtoms[$i]) {
#then that atom is present and needs a subscript
$formattedSubs = $formattedSubs.$PresentAtoms[$i].",";
} else { #if it is not defined
#it does not have any atoms of that symbol
$formattedSubs = $formattedSubs.",";
}
}
#now the meat of the program is done, all that needs to be done is to return references to the updated hash and array
#and to return the formatted string.
return (\@newMetLabels, \%newAtomicSymbols, $formattedSubs);
}
#this subroutine is written to search the KEGG database for a specifica name. It also holds contingencies such as
#a hash for amino acids (since they are commonly listed by their three letter designation), and a contingency for
#protons (charge +1, formula hydrogen), proteins, polymers with a designated n-value, and finally a hash for common
#acronyms (as part of the amino acid hash)
#inputs:
# $speciesName = chemical species name
#outputs:
# $KEGGID = KEGG ID+ for chemical compound, format C[0-9]+ OR $speciesName if KEGGID is passed
# $speciesCharge = formal charge of chemical species
# $CF = chemical formula of the named compound
#call looks like: ($KEGGID, $speciesCharge, $CF) = &searchKEGGforCF($speciesName);
sub searchKEGGforCF {
#first, create a hash for 3 letter protein codes that often appear in chemical species,
#this will allow for easy lookup of amino acids. For most, a specific chirality is noted so that
#search parameters may fit more exactly, since chirality does not affect the molecular formula
#this hash gives KEGG ID, charge and molecular formula, allowing us to forgo a KEGG search
#the return string may be split using the split function with delimeter :
my %commonSpecies = (
"Ala" => "L-Alanine:C00041:0:C2H7NO2",
"Ile" => "L-Isoleucine:C00407:0:C6H13NO2",
"Leu" => "L-Leucine:C00123:0:C6H12NO2",
"Val" => "L-Valine:C00183:0:C5H11NO2",
"Phe" => "L-Phenylalanine:C00079:0:C9H11NO2",
"Trp" => "L-Tryptophan:C00078:0:C11H12N2O2",
"Tyr" => "L-Tyrosine:C00082:0:C9H11NO3",
"Asn" => "L-Asparagine:C00152:0:C4H8N203",
"Cys" => "L-Cysteine:C00097:0:C3H7NO2S",
"Gln" => "L-Glutamine:C00064:0:C5H10N2O3",
"Met" => "L-Methionine:C00073:0:C5H11NO2S",
"Ser" => "L-Serine:C00065:0:C3H7N03",
"Thr" => "L-Threonine:c00188:0:C4H9NO3",
"Asp" => "L-Aspartic Acid:C00049:0:C4H7NO4",
"Glu" => "L-Glutamic Acid:C00025:0:C5H9NO4",
"Arg" => "L-Arginine:C00062:0:C6H14N4O2",
"His" => "L-Histidine:C00135:0:C6H9N3O2",
"Lys" => "L-Lysine:C00047:0:C6H14N2O2",
"Gly" => "Glycine:C00037:0:C2H5N02",
"Pro" => "L-Proline:C00148:0:C5H9NO2",
"ATP" => "Adenosine 5'-triphosphate:C00002:0:C10H16N5O13P3",
"ADP" => "Adenosine 5'-diphosphate:C00008:0:C10H15N5O10P2",
"AMP" => "Adenosine 5'-monophosphate:C00020:0:C10H14N5O7P1",
"NADP+" => "NADP+:C00006:+1:C21H29N7O17P3",
"NADPH" => "NADPH:C00005:0:C21H30N7O17P3",
"H2O" => "Water:C00001:0:H2O",
"Ammonium" => "Ammonium:C01342:1:NH3",
"H+" => "Proton:C00080:1:H",
"H" => "Proton:C00080:1:H",
"H2SO4" => "Sulfuric acid:C00059:0:H2SO4",
"fa6" => "fa6:UNKNOWN:-1:C16H31O2",
"fa6coa" => "fa6:UNKNOWN:-1:C37H63N7O17P3S",
"HYXN" => "Hypoxanthine:C00262:0:C5H4N4O",
);
chomp($_[0]);
$speciesName = $_[0];
$KEGGID = "";
$speciesCharge = "0";
$CF = "";
#format the species name to something that is searchable
$searchName = &formatName($speciesName);
#give an output so the user does not become impatient and terminate early
printf "Searching KEGG for chemical species |%s|\n", $searchName;
#now before we get too involved, let us see if we can forgo a KEGG search, by avoiding searches on
#the provided chemical species by way of it being a proton, protein, or in common species hash
if ($searchName =~ /protein/i) { #if the species is named as a protien
#then it will not be able to be found in KEGG compound, or even have a meaningful molecular
#formula, as it may well vary on a species to species basis
$KEGGID = "PROTEIN";
$speciesCharge = "0";
$CF = "AA POLYMER";
return ($KEGGID, $speciesCharge, $CF);
} else { #check if in common species hash
#for each element in common species hash
foreach $key (sort keys %commonSpecies) {
#search to see if the name contains the hash key, but need to be careful so we just
#pick up those instances where a protien or a form of ATP was intended
#ATP/AMP/ADP will stand alone, however, it is possible that the two bound
#if AAs stand alone will likely be full name such as "L-Leucine"
#there are more sophisticated names with alternations of AA codes and molecule names,
#but I have to be realistic and stop looking for specialized occurances at some point
$hashOut = $commonSpecies{$key};
#if species contains just a three letter code for an AA
if ($searchName =~ /^$key$/) {
($speciesName, $KEGGID, $speciesCharge, $CF) = split /:/, $hashOut;
return ($KEGGID, $speciesCharge, $CF);
} elsif ($hashOut =~ /^$searchName$/i) { #if the hash value for the key contains the search name
#then the name fed must have been a full amino acid name, return the hash information
($speciesName, $KEGGID, $speciesCharge, $CF) = split /:/, $hashOut;
return ($KEGGID, $speciesCharge, $CF);
}
#otherwise, doesn't look like any common species are in here
}
}
#added section to handle if KEGGID is directly passed to this fuction
if ($searchName =~ /^C[0-9][0-9][0-9][0-9][0-9]$/) {
#we now know what page to go to, will save runtime
$UserAgent = LWP::UserAgent->new; #creates new user agent
$compoundURL = "http://rest.kegg.jp/get/".$&;
$compoundResults = $UserAgent->get($compoundURL);
$compoundResults = $compoundResults->content;
#if there is a chemical formula given, get everything on the formula line after formula followed by spaces (or a tab)
#see example page http://rest.kegg.jp/get/C00002 to see how I chose the regular expressions
#recall that the metacharacter '.' matches everything but newline characters
if ($compoundResults =~ /FORMULA\s+(.+)/) {
#what is captured by the parentheses is the desired info
$CF = $1;
chomp($CF);
}
if ($compoundResults =~ /ENTRY\s+(.+?)\s+/) {
#what is captured by the parentheses is the desired info
$KEGGID = $1;
chomp($KEGGID);
}
if ($compoundResults =~ /NAME\s+(.+)/) {
#what is captured by the parentheses is the desired info
$speciesName = $1;
#remove trailing semi-colon if has more than one name
$speciesName =~ s/;$//ig;
#replace commas, which could mess up the formatting
$speciesName =~ s/,/;/ig;
chomp($speciesName);
}
return ($speciesName, $speciesCharge, $CF);
}
#alright, since we've gotten here, we haven't got it anywhere convenient, so we are looking it up
#in the KEGG database
$searchResPage = "http://rest.kegg.jp/find/compound/".$searchName;
#create a new user agent which pulls the html source code for the KEGG search page
$UserAgent = LWP::UserAgent->new; #creates new user agent
$searchResponse = $UserAgent->get($searchResPage);
$searchResponse = $searchResponse->content;
#if the search result page contains the species name we are looking for, go to the page indicated by the compound ID and get all the
#necessary info. Multiple formats, hence the multiple conditions. The (+|-|) line allows for finding ions. ions from Model seed lack +/id
#symbol, but has number for charge still.
if ($searchResponse =~ /cpd:(C[0-9]+)\s+$searchName(\+|-|);/ig) { #first item in list
#get the compound name to access the KEGG API page
$compoundURL = "http://rest.kegg.jp/get/".$1;
$compoundResults = $UserAgent->get($compoundURL);
$compoundResults = $compoundResults->content;
#if there is a chemical formula given, get everything on the formula line after formula followed by spaces (or a tab)
#see example page http://rest.kegg.jp/get/C00002 to see how I chose the regular expressions
#recall that the metacharacter '.' matches everything but newline characters
if ($compoundResults =~ /FORMULA\s+(.+)/) {
#what is captured by the parentheses is the desired info
$CF = $1;
chomp($CF);
}
if ($compoundResults =~ /ENTRY\s+(.+?)\s+/) {
#what is captured by the parentheses is the desired info
$KEGGID = $1;
chomp($KEGGID);
}
if ($compoundResults =~ /NAME\s+(.+)/) {
#what is captured by the parentheses is the desired info
$speciesName = $1;
chomp($speciesName);
}
return ($KEGGID, $speciesCharge, $CF);
} elsif ($searchResponse =~ /cpd:(C[0-9]+)\s+.+?;\s$searchName(\+|-|);/ig) { #middle item in list
#get the compound name to access the KEGG API page
$compoundURL = "http://rest.kegg.jp/get/".$1;
$compoundResults = $UserAgent->get($compoundURL);
$compoundResults = $compoundResults->content;
#if there is a chemical formula given, get everything on the formula line after formula followed by spaces (or a tab)
#see example page http://rest.kegg.jp/get/C00002 to see how I chose the regular expressions
#recall that the metacharacter '.' matches everything but newline characters
if ($compoundResults =~ /FORMULA\s+(.+)/) {
#what is captured by the parentheses is the desired info
$CF = $1;
chomp($CF);
}
if ($compoundResults =~ /ENTRY\s+(.+?)\s+/) {
#what is captured by the parentheses is the desired info
$KEGGID = $1;
chomp($KEGGID);
}
if ($compoundResults =~ /NAME\s+(.+)/) {
#what is captured by the parentheses is the desired info
$speciesName = $1;
chomp($speciesName);
}
return ($KEGGID, $speciesCharge, $CF);
} elsif ($searchResponse =~ /cpd:(C[0-9]+)\s+.+?;\s$searchName(\+|-|)(\r|\n|$)/ig) { #last item in list
#get the compound name to access the KEGG API page
$compoundURL = "http://rest.kegg.jp/get/".$1;
$compoundResults = $UserAgent->get($compoundURL);
$compoundResults = $compoundResults->content;
#if there is a chemical formula given, get everything on the formula line after formula followed by spaces (or a tab)
#see example page http://rest.kegg.jp/get/C00002 to see how I chose the regular expressions
#recall that the metacharacter '.' matches everything but newline characters
if ($compoundResults =~ /FORMULA\s+(.+)/) {
#what is captured by the parentheses is the desired info
$CF = $1;
chomp($CF);
}
if ($compoundResults =~ /ENTRY\s+(.+?)\s+/) {
#what is captured by the parentheses is the desired info
$KEGGID = $1;
chomp($KEGGID);
}
if ($compoundResults =~ /NAME\s+(.+)/) {
#what is captured by the parentheses is the desired info
$speciesName = $1;
chomp($speciesName);
}
return ($KEGGID, $speciesCharge, $CF);
} elsif ($searchResponse =~ /cpd:(C[0-9]+)\s+$searchName(\+|-|)\n/ig) { #only item in list
#get the compound name to access the KEGG API page
$compoundURL = "http://rest.kegg.jp/get/".$1;
$compoundResults = $UserAgent->get($compoundURL);
$compoundResults = $compoundResults->content;
#if there is a chemical formula given, get everything on the formula line after formula followed by spaces (or a tab)
#see example page http://rest.kegg.jp/get/C00002 to see how I chose the regular expressions
#recall that the metacharacter '.' matches everything but newline characters
if ($compoundResults =~ /FORMULA\s+(.+)/) {
#what is captured by the parentheses is the desired info
$CF = $1;
chomp($CF);
}
if ($compoundResults =~ /ENTRY\s+(.+?)\s+/) {
#what is captured by the parentheses is the desired info
$KEGGID = $1;
chomp($KEGGID);
}
if ($compoundResults =~ /NAME\s+(.+)/) {
#what is captured by the parentheses is the desired info
$speciesName = $1;
chomp($speciesName);
}
return ($KEGGID, $speciesCharge, $CF);
}
#if we have made it to this point, we have been unable to find the KEGG page, so make a return that indicates this
$KEGGID = "NO PAGE FOUND";
$speciesCharge = " ";
$CF = " ";
return ($KEGGID, $speciesCharge, $CF);
}
#this subroutine is written to search the KEGG database for a specifica name to get out the KEGG ID. Pretty much a carbon
#copy of searchKEGGforCF, but only returns one variable, the KEGG ID
#inputs:
# $speciesName = chemical species name
#outputs:
# $KEGGID = KEGG ID+ for chemical compound, format C[0-9]+
#call looks like: $KEGGID = &searchKEGGforCF($speciesName);
sub searchKEGGforID {
#first, create a hash for 3 letter protein codes that often appear in chemical species,
#this will allow for easy lookup of amino acids. For most, a specific chirality is noted so that
#search parameters may fit more exactly, since chirality does not affect the molecular formula
#this hash gives KEGG ID, charge and molecular formula, allowing us to forgo a KEGG search
#the return string may be split using the split function with delimeter :
my %commonSpecies = (
"Ala" => "L-Alanine:C00041:0:C2H7NO2",
"Ile" => "L-Isoleucine:C00407:0:C6H13NO2",
"Leu" => "L-Leucine:C00123:0:C6H12NO2",
"Val" => "L-Valine:C00183:0:C5H11NO2",
"Phe" => "L-Phenylalanine:C00079:0:C9H11NO2",
"Trp" => "L-Tryptophan:C00078:0:C11H12N2O2",
"Tyr" => "L-Tyrosine:C00082:0:C9H11NO3",
"Asn" => "L-Asparagine:C00152:0:C4H8N203",
"Cys" => "L-Cysteine:C00097:0:C3H7NO2S",
"Gln" => "L-Glutamine:C00064:0:C5H10N2O3",
"Met" => "L-Methionine:C00073:0:C5H11NO2S",
"Ser" => "L-Serine:C00065:0:C3H7N03",
"Thr" => "L-Threonine:c00188:0:C4H9NO3",
"Asp" => "L-Aspartic Acid:C00049:0:C4H7NO4",
"Glu" => "L-Glutamic Acid:C00025:0:C5H9NO4",
"Arg" => "L-Arginine:C00062:0:C6H14N4O2",
"His" => "L-Histidine:C00135:0:C6H9N3O2",
"Lys" => "L-Lysine:C00047:0:C6H14N2O2",
"Gly" => "Glycine:C00037:0:C2H5N02",
"Pro" => "L-Proline:C00148:0:C5H9NO2",
"ATP" => "Adenosine 5'-triphosphate:C00002:0:C10H16N5O13P3",
"ADP" => "Adenosine 5'-diphosphate:C00008:0:C10H15N5O10P2",
"AMP" => "Adenosine 5'-monophosphate:C00020:0:C10H14N5O7P1",
"NADP+" => "NADP+:C00006:+1:C21H29N7O17P3",
"NADPH" => "NADPH:C00005:0:C21H30N7O17P3",
"H2O" => "Water:C00001:0:H2O",
"Ammonium" => "Ammonium:C01342:1:NH3",
"H\+" => "Proton:C00080:1:H",
"H" => "Proton:C00080:1:H",
"H2SO4" => "Sulfuric acid:C00059:0:H2SO4",
"fa6" => "fa6:UNKNOWN:-1:C16H31O2",
"fa6coa" => "fa6:UNKNOWN:-1:C37H63N7O17P3S",
"HYXN" => "Hypoxanthine:C00262:0:C5H4N4O",
);
chomp($_[0]);
$speciesName = $_[0];
$KEGGID = "";
$speciesCharge = "";
$CF = "";
#format the species name to something that is searchable
$searchName = &formatName($speciesName);
#give an output so the user does not become impatient and terminate early
printf "Searching KEGG for chemical species |%s|\n", $searchName;
#now before we get too involved, let us see if we can forgo a KEGG search, by avoiding searches on
#the provided chemical species by way of it being a proton, protein, or in common species hash
if ($searchName =~ /protein/i) { #if the species is named as a protien
#then it will not be able to be found in KEGG compound, or even have a meaningful molecular
#formula, as it may well vary on a species to species basis
$KEGGID = "PROTEIN";
$speciesCharge = "0";
$CF = "AA POLYMER";
return $KEGGID;
} else { #check if in common species hash
#for each element in common species hash
foreach $key (sort keys %commonSpecies) {
#search to see if the name contains the hash key, but need to be careful so we just
#pick up those instances where a protien or a form of ATP was intended
#ATP/AMP/ADP will stand alone, however, it is possible that the two bound
#if AAs stand alone will likely be full name such as "L-Leucine"
#there are more sophisticated names with alternations of AA codes and molecule names,
#but I have to be realistic and stop looking for specialized occurances at some point
$hashOut = $commonSpecies{$key};
#if species contains just a three letter code for an AA
if ($searchName =~ /^$key$/) {
($speciesName, $KEGGID, $speciesCharge, $CF) = split /:/, $hashOut;
return $KEGGID;
} elsif ($hashOut =~ /^$searchName$/i) { #if the hash value for the key contains the search name
#then the name fed must have been a full amino acid name, return the hash information
($speciesName, $KEGGID, $speciesCharge, $CF) = split /:/, $hashOut;
return $KEGGID;
}
#otherwise, doesn't look like any common species are in here
}
}
#alright, since we've gotten here, we haven't got it anywhere convenient, so we are looking it up
#in the KEGG database
$searchResPage = "http://rest.kegg.jp/find/compound/".$searchName;
#create a new user agent which pulls the html source code for the KEGG search page
$UserAgent = LWP::UserAgent->new; #creates new user agent
$searchResponse = $UserAgent->get($searchResPage);
$searchResponse = $searchResponse->content;
#if the search result page contains the species name we are looking for, go to the page indicated by the compound ID and get all the
#necessary info. Multiple formats, hence the multiple conditions. The (+|-|) line allows for finding ions. ions from Model seed lack +/id
#symbol, but has number for charge still.
if ($searchResponse =~ /cpd:(C[0-9]+)\s+$searchName(\+|-|);/ig) { #first item in list
#get the compound name to access the KEGG API page
$compoundURL = "http://rest.kegg.jp/get/".$1;
$compoundResults = $UserAgent->get($compoundURL);
$compoundResults = $compoundResults->content;
#if there is a chemical formula given, get everything on the formula line after formula followed by spaces (or a tab)
#see example page http://rest.kegg.jp/get/C00002 to see how I chose the regular expressions
#recall that the metacharacter '.' matches everything but newline characters
if ($compoundResults =~ /FORMULA\s+(.+)/) {
#what is captured by the parentheses is the desired info
$CF = $1;
chomp($CF);
}
if ($compoundResults =~ /ENTRY\s+(.+?)\s+/) {
#what is captured by the parentheses is the desired info
$KEGGID = $1;
chomp($KEGGID);
}
if ($compoundResults =~ /NAME\s+(.+)/) {
#what is captured by the parentheses is the desired info
$speciesName = $1;
chomp($speciesName);
}
return $KEGGID;
} elsif ($searchResponse =~ /cpd:(C[0-9]+)\s+.+?;\s$searchName(\+|-|);/ig) { #middle item in list
#get the compound name to access the KEGG API page
$compoundURL = "http://rest.kegg.jp/get/".$1;
$compoundResults = $UserAgent->get($compoundURL);
$compoundResults = $compoundResults->content;
#if there is a chemical formula given, get everything on the formula line after formula followed by spaces (or a tab)
#see example page http://rest.kegg.jp/get/C00002 to see how I chose the regular expressions
#recall that the metacharacter '.' matches everything but newline characters
if ($compoundResults =~ /FORMULA\s+(.+)/) {
#what is captured by the parentheses is the desired info
$CF = $1;
chomp($CF);
}
if ($compoundResults =~ /ENTRY\s+(.+?)\s+/) {
#what is captured by the parentheses is the desired info
$KEGGID = $1;
chomp($KEGGID);
}
if ($compoundResults =~ /NAME\s+(.+)/) {
#what is captured by the parentheses is the desired info
$speciesName = $1;
chomp($speciesName);
}
return $KEGGID;
} elsif ($searchResponse =~ /cpd:(C[0-9]+)\s+.+?;\s$searchName(\+|-|)(\r|\n|$)/ig) { #last item in list
#get the compound name to access the KEGG API page
$compoundURL = "http://rest.kegg.jp/get/".$1;
$compoundResults = $UserAgent->get($compoundURL);
$compoundResults = $compoundResults->content;
#if there is a chemical formula given, get everything on the formula line after formula followed by spaces (or a tab)
#see example page http://rest.kegg.jp/get/C00002 to see how I chose the regular expressions
#recall that the metacharacter '.' matches everything but newline characters
if ($compoundResults =~ /FORMULA\s+(.+)/) {
#what is captured by the parentheses is the desired info
$CF = $1;
chomp($CF);
}
if ($compoundResults =~ /ENTRY\s+(.+?)\s+/) {
#what is captured by the parentheses is the desired info
$KEGGID = $1;
chomp($KEGGID);
}
if ($compoundResults =~ /NAME\s+(.+)/) {
#what is captured by the parentheses is the desired info
$speciesName = $1;
chomp($speciesName);
}
return $KEGGID;
} elsif ($searchResponse =~ /cpd:(C[0-9]+)\s+$searchName(\+|-|)\n/ig) { #only item in list
#get the compound name to access the KEGG API page
$compoundURL = "http://rest.kegg.jp/get/".$1;
$compoundResults = $UserAgent->get($compoundURL);
$compoundResults = $compoundResults->content;
#if there is a chemical formula given, get everything on the formula line after formula followed by spaces (or a tab)
#see example page http://rest.kegg.jp/get/C00002 to see how I chose the regular expressions
#recall that the metacharacter '.' matches everything but newline characters
if ($compoundResults =~ /FORMULA\s+(.+)/) {
#what is captured by the parentheses is the desired info
$CF = $1;
chomp($CF);
}
if ($compoundResults =~ /ENTRY\s+(.+?)\s+/) {
#what is captured by the parentheses is the desired info
$KEGGID = $1;
chomp($KEGGID);
}
if ($compoundResults =~ /NAME\s+(.+)/) {
#what is captured by the parentheses is the desired info
$speciesName = $1;
chomp($speciesName);
}
return $KEGGID;
}
#if we have made it to this point, we have been unable to find the KEGG page, so make a return that indicates this
$KEGGID = "NO PAGE FOUND";
$speciesCharge = " ";
$CF = " ";
return $KEGGID;
}
#this subroutine is written to search the KEGG database with a KEGG ID to get out the name. Pretty much a barebones
#copy of searchKEGGforCF, only returning one variable, the name
#inputs:
# $compoundID = chemical species KEGG ID
#outputs:
# $compoundName = chemical species name
#call looks like: $compoundName = &searchKEGGforName($compoundID);
sub searchKEGGforName {
no warnings 'uninitialized';
chomp($_[0]);
$KEGGID = $_[0];
if ($KEGGID eq 'C00080') {
return "proton";
}
$speciesName = "";
$searchResPage = "http://rest.kegg.jp/get/".$KEGGID."/";
printf "search url: %s\n", $searchResPage;
#create a new user agent which pulls the html source code for the KEGG search page
$UserAgent = LWP::UserAgent->new; #creates new user agent
$searchResponse = $UserAgent->get($searchResPage);
$searchResponse = $searchResponse->content;
#search KEGG for species name
if ($searchResponse =~ /NAME\s+(.+)/) {
$speciesName = $1;
$speciesName =~ s/\t//g;
$speciesName =~ s/^\s//;
$speciesName =~ s/\s$//;
$speciesName =~ s/;$//;
}
#had to add a recursive call to this if there was a search page and the subroutine
#did not give back a name, for some reason this call kept timing out
if ((length $speciesName == 0) && (length $searchResponse != 0)) {
$speciesName = &searchKEGGforName($KEGGID);
}
return $speciesName;
}
#this subroutine was created to combat slow internet by making a recursive subroutine to perform internet searches
#to prevent timeout problems by running the request against
#inputs:
# $_[0] = url to query
#outputs:
# $page = page content results of internet search
sub queryInternet {
my $UserAgent = LWP::UserAgent->new; #creates new user agent
$url = $_[0]; #links to page
$page = $UserAgent->get($url); #gets all information
printf "query: %s\n", $url;
$page = $page->content; #filters information to just page contents
if ($page =~ /A connection attempt failed because the connected host has failed to respond/) {
$page = queryInternet($url);
}
return $page;
}
#this subroutine takes a line from an NCBI output from a protein details page. This is downloaded as is from the link "Download table",
#and then two operations are performed on it for ease of reading. 1) replace all commas with semicolons. 2)replace all tabs (\t) with
#commas then finally 3) save the table as a .csv. This transformation should make it easy to read the file, manually and by program
#table for E dermatitidis downloaded from URL: https://www.ncbi.nlm.nih.gov/genome/proteins/2962?genome_assembly_id=34221&gi=-1
#
#Additionally, it also works with UniProt data downloaded from the following URL
#http://www.uniprot.org/uniprot/?query=taxonomy%3A%22Exophiala+dermatitidis+%28strain+ATCC+34100+%2F+CBS+525.76+%2F+NIH%2FUT8656%29+%28Black+yeast%29+%28Wangiella+dermatitidis%29+%5B858893%5D%22&sort=score
#with the following provisos:
#1) before downloading table, ensure that a column is present for EC number (can be added by selecting
# the "columns" button above the table)
#2)Once downloaded, replace all commas with semicolons (necessary for csv formatting not to be thrown off)
#3)replace all tabs (\t) with commas (easily done in notepad ++)
#4) save the resultant file as a .csv
#this allows for easy of manual reading (as it may be opened in Excel) and ease of programmatic
#reading
#any organism should ideally have a similar page and table in both databases
#in both cases returns all EC numbers which are returned by the Brenda search, as they are returned in
#numerical order and therefore there is no way in which to establish a "best" result
#inputs:
# $_[0] = single line from NCBI csv file (after above transformation), from which the protein name will be taken
# $_[1] = column number of protein name
#output:
# $ECno = enzyme classification number for said protein, if available, or "not found" if unavailable
sub searchBrendaProtein {
$ECno = "unknown"; #establish default value of $ECno
@ECnos = ( ); #stores array of EC numbers if multiple search results
$line = $_[0]; #give the input a useful handle
chomp($line);
@cells = split /,/, $line; #split the cells into an array where individual cells may be accessed
#get the protien name, which should be present in the fourth cell (recall indexing begins at 0)
$proteinName = $cells[$_[1]];
#next code line is to replace all spaces with a plus sign for use in the url
#if it is a "hypothetical protein" (NCBI nomenclature) or a "putative uncharacterized protein" (UniProt Nomenclature)
if ($proteinName =~ /hypothetical protein/ig || $proteinName =~ /putative uncharacterized protein/ig) {
#return a value to indicate that we do not know the EC number. This is necessary because the serch string
#hypothetical protien
return $ECno;
}
$_ = $proteinName;
s/\s+/+/ig;
s/;/,/ig;
$searchName = $_;
#url for search result page with the correct protein added as search string. Now this URL does specify the
#number of results, but this seems to make no difference.
$url = "http://www.brenda-enzymes.org/search_result.php?quicksearch=1&noOfResults=10&a=9&W[2]=".$searchName."&T[2]=2";
#have an output that user can see, so that the user can track progress
printf "Searching Brenda for |%s|...", $proteinName;
$UserAgent = LWP::UserAgent->new; #creates new user agent
$searchResponse = $UserAgent->get($url);
$searchResponse = $searchResponse->content;
#search for return string which indicates no result from the Brenda search. The unique string is "Sorry, no results could be
#found for your query!" To save space in coding, I simply will search for the word "sorry, no results" as it is unique to no search results
if ($searchResponse =~ /Sorry, no results/ig) {
#no search results have been found, return unknown
printf "EC: %s\n", $ECno;
return $ECno;
}
#otherwise, we have an EC number!
#we need to get each EC returned by the Brenda search
#this line captures all instances in the HTML code of digit(s).digit(s).digit(s).digit(s) which is the format for EC numbers
#as far as I have seen, no other area of the Brenda HTML code has the same formatting. Unfortunately, there is significant
#redundancy in the HTML code, so there needs to be a system to only add unique EC numbers
while($searchResponse =~ /[0-9]+\.([0-9]+|-)\.([0-9]+|-)\.([0-9]+|-)/ig) {
#for each match, check if the matched EC number is already present in the array
no warnings;
if ($& ~~ @ECnos) {
#if it is already present, do nothing
} else {
#otherwise, add the new EC number to the array
push @ECnos, $&;
}
use warnings;
}
$ECno = join "|", @ECnos;
#now we have an array of EC numbers, which is difficult to pass, and would leave a file with an unknown number of columns
#we will join all EC numbers with a verticle bar for easy reading and so that we know that these are possible ec numbers
#as confirmed complexes of EC numbers are usually annotated with colons or semicolons
#shows the user the EC number found
printf "EC: %s\n", $ECno;
#return the EC number found
return $ECno;
}
#This fuction is designed to, given an EC number, search KEGG to get the reactions and metabolites associated with that EC number
#while the concept is simple, the application is long. This is basically a followup program to getting a list of unique enzyme classifications
#from an annotated genome, and turning that annotation into the beginnings of a model.
#inputs:
# $_[0] = EC number of the enzyme
#outputs:
# \@rxnInfo = a reference to an array of strings formatted for a CSV which contains reaction information
# \@metIDs = a reference to an array of metabolite KEGG IDs, will be used to search for metabolite information in another function
sub SearchKEGGEC {
$EC = $_[0];
#create a user agent
$UserAgent = LWP::UserAgent->new; #creates new user agent
#go to to URL
$url = "http://rest.kegg.jp/get/".$EC;
$ECpage = $UserAgent->get($url);
#get the HTML contents of the page
$ECpage = $ECpage->content;
#since this will take a while cumulatively, allow an output to the user to know this program is still running
printf "searching for reaction associated with EC:%s\n",$EC;
#okay, so now we have the source code for the page, pull all reaction IDs
my @rxnIDs = ( ); #array to store reaction numbers we find
my @metIDs = ( ); #array to store metabolite IDs
my @rxnInfo = ( );
#so, how this will work is we seach for all instances of R[0-9][0-9][0-9][0-9][0-9] (that is RXXXXX where X is a digit)
#then, it will check that match against @rxnNumbers, if there is a match, then we will not add it (it is already there). If there
#is not a match, we will push it into @rxnNumbers. We should also avoid general reactions that will have such generalities as "a primary alcohol"
while($ECpage =~ /R[0-9][0-9][0-9][0-9][0-9](\s|;)/g) {
if (grep(/^$&$/, @rxnIDs)) {
#do nothing, reaction ID already present
} else {
#add reaction ID to the list, after formatting
$tempRxn = $&;
chomp($tempRxn);
$tempRxn =~ s/;//ig;
push @rxnIDs, $tempRxn;
}
}
#at this point we should have an array of reaction IDs. Now look them up, then use that information to create the written lines of
#information taht we are trying to return. Since this bit will take a while (cumulatively), allow an output to notify the user that
#the program is still running
for (my $i = 0; $i <= $#rxnIDs; $i++) {
printf "Getting information for reaction ID %s\n", $rxnIDs[$i];
$url = "http://rest.kegg.jp/get/".$rxnIDs[$i];
$RxnPage = $UserAgent->get($url);
$RxnPage = $RxnPage->content;
$RxnWritten = " ";
#while there are more compound IDs to find on the page (all compound IDs part of reaction line)
#OR while there are more arrows (actually should only be one, but still make sure to grab it)
# note that < is the html character for <
#OR while there are more plus signs
#OR while there are more stoichiometric coefficients
if ($RxnPage =~ /EQUATION\s+(.*)/) {
#what was captured is the equation
$RxnWritten = $1;
#now, we need to add 1's where appropriate for GAMS to be able to handle our output
#first, if the compound is anchored at the front of the written reaction, add a "1 "
if ($RxnWritten =~ /^C/) {
$RxnWritten = "1 ".$RxnWritten;
}
#second, if there is a plus and a compound without a number between, add a 1
while($RxnWritten =~ /\+\sC/) {
#we need to add a 1 between the space and the plus
#so, use indexing and substrings to split it
$splitPoint = index($RxnWritten, "+ C") + 1;
$RxnWritten = substr($RxnWritten, 0, $splitPoint)." 1".substr($RxnWritten, $splitPoint, length($RxnWritten));
}
#third, if the compount after the arrow does not have a 1
if ($RxnWritten =~ /=>\sC/) {
#split and add the 1
$splitPoint = index($RxnWritten, "=> C") + 2;
$RxnWritten = substr($RxnWritten, 0, $splitPoint)." 1".substr($RxnWritten, $splitPoint, length($RxnWritten));
}
#finally, push all compound id's found into @metIDs
while($RxnWritten =~ /C[0-9][0-9][0-9][0-9][0-9]/ig) {
push @metIDs, $&;
}
}
chomp($RxnWritten);
#now we have the reaction written. let's create the written line which is "EC number, Rxn ID, Rxn Written"
$rxnIDs[$i] =~ s/\s$//ig;
$writtenRxnLine = $EC.",".$rxnIDs[$i].",".$RxnWritten.",";
push @rxnInfo, $writtenRxnLine;
}
#in here, we will not search for required info for metabolites. There will be a lot of overlap and redundancy if this is done, and personally
#I'd rather not waste the CPU time to search for "C00001" (water) a thousand times
#return the array references
return \@rxnInfo, \@metIDs;
}
#This fuction is designed to, given an EC number, search KEGG to get the reactions and metabolites associated with that EC number
#while the concept is simple, the application is long. This is basically a followup program to getting a list of unique enzyme classifications
#from an annotated genome, and turning that annotation into the beginnings of a model.
#inputs:
# $_[0] = EC number of the enzyme
# #_[1] = compartment in which the reaction is occurning
#outputs:
# \@rxnInfo = a reference to an array of strings formatted for a CSV which contains reaction information
# \@metIDs = a reference to an array of metabolite KEGG IDs, will be used to search for metabolite information in another function
sub SearchKEGGECwCOMP {
$EC = $_[0];
$compartment = $_[1];
#create a user agent
$UserAgent = LWP::UserAgent->new; #creates new user agent
#go to to URL
$url = "http://rest.kegg.jp/get/".$EC;
$ECpage = $UserAgent->get($url);
#get the HTML contents of the page
$ECpage = $ECpage->content;
#since this will take a while cumulatively, allow an output to the user to know this program is still running
printf "searching for reaction associated with EC:%s\n",$EC;
#okay, so now we have the source code for the page, pull all reaction IDs
my @rxnIDs = ( ); #array to store reaction numbers we find
my @metIDs = ( ); #array to store metabolite IDs
my @rxnInfo = ( );
#so, how this will work is we seach for all instances of R[0-9][0-9][0-9][0-9][0-9] (that is RXXXXX where X is a digit)
#then, it will check that match against @rxnNumbers, if there is a match, then we will not add it (it is already there). If there
#is not a match, we will push it into @rxnNumbers. We should also avoid general reactions that will have such generalities as "a primary alcohol"
while($ECpage =~ /R[0-9][0-9][0-9][0-9][0-9](\s|;)/g) {
if (grep(/^$&$/, @rxnIDs)) {
#do nothing, reaction ID already present
} else {
#add reaction ID to the list, after formatting
$tempRxn = $&;
chomp($tempRxn);
$tempRxn =~ s/;//ig;
push @rxnIDs, $tempRxn;
}
}
#at this point we should have an array of reaction IDs. Now look them up, then use that information to create the written lines of
#information taht we are trying to return. Since this bit will take a while (cumulatively), allow an output to notify the user that
#the program is still running
for (my $i = 0; $i <= $#rxnIDs; $i++) {
printf "Getting information for reaction ID %s\n", $rxnIDs[$i];
$url = "http://rest.kegg.jp/get/".$rxnIDs[$i];
$RxnPage = $UserAgent->get($url);
$RxnPage = $RxnPage->content;
$RxnWritten = " ";
#while there are more compound IDs to find on the page (all compound IDs part of reaction line)
#OR while there are more arrows (actually should only be one, but still make sure to grab it)
# note that < is the html character for <
#OR while there are more plus signs
#OR while there are more stoichiometric coefficients
if ($RxnPage =~ /EQUATION\s+(.*)/) {
#what was captured is the equation
$RxnWritten = $1;
#now, we need to add 1's where appropriate for GAMS to be able to handle our output
#first, if the compound is anchored at the front of the written reaction, add a "1 "
if ($RxnWritten =~ /^C/) {
$RxnWritten = "1 ".$RxnWritten;
}
#second, if there is a plus and a compound without a number between, add a 1
while($RxnWritten =~ /\+\sC/) {
#we need to add a 1 between the space and the plus
#so, use indexing and substrings to split it
$splitPoint = index($RxnWritten, "+ C") + 1;
$RxnWritten = substr($RxnWritten, 0, $splitPoint)." 1".substr($RxnWritten, $splitPoint, length($RxnWritten));
}
#third, if the compount after the arrow does not have a 1
if ($RxnWritten =~ /=>\sC/) {
#split and add the 1
$splitPoint = index($RxnWritten, "=> C") + 2;
$RxnWritten = substr($RxnWritten, 0, $splitPoint)." 1".substr($RxnWritten, $splitPoint, length($RxnWritten));
}
#finally, push all compound id's found into @metIDs
while($RxnWritten =~ /C[0-9][0-9][0-9][0-9][0-9]/ig) {
#add compartment information
my $match = $&;
my $match_w_comp = $match.$compartment;
push @metIDs, $match_w_comp;
}
}
my $temp_rxn_writ = $RxnWritten;
#finally, add compartment to each metabolites
while($RxnWritten =~ /C\d\d\d\d\d/g) {
my $temp = $&;
$temp_rxn_writ =~ s/$temp/$temp$compartment/g;
}
#reformat arrow
$temp_rxn_writ =~ s/\=/\-/;
$RxnWritten = $temp_rxn_writ;
chomp($RxnWritten);
#now we have the reaction written. let's create the written line which is "EC number, Rxn ID, Rxn Written"
$rxnIDs[$i] =~ s/\s$//ig;
$writtenRxnLine = $rxnIDs[$i].$compartment."\t".$RxnWritten;
push @rxnInfo, $writtenRxnLine;
}
#in here, we will not search for required info for metabolites. There will be a lot of overlap and redundancy if this is done, and personally
#I'd rather not waste the CPU time to search for "C00001" (water) a thousand times
#return the array references
return \@rxnInfo, \@metIDs;
}
#This function will remove redundant entires in an array, and return a reference to a non redundant array.
#inputs:
# $_[0]=$@arrayRef = a reference to an array suspected of having redundancies
#outputs:
# \@nr = reference to an array without those redundancies
sub RemoveRedundancies {
$arrayRef = $_[0];
@array = @$arrayRef; #input array
%arrayKey = ( ); #allows us to check if a value exists in the array already. This is the inverse function of the non-redundant
@nr = ( ); #this is the non-redundant array
for (my $i=0; $i<=$#array; $i++) {
if (! exists $arrayKey{$array[$i]}) { #if the value does not have a key in the array
$arrayKey{$array[$i]} = $#nr + 1; #add the value and the key to the hash, so it is known that this value is now in the array
push @nr, $array[$i];
} #otherwise do nothing, already in the array
}
return \@nr;
}
#This function will take a KEGG compound ID, and return the name and chemical formula
#inputs:
# $_[0]=$nrMetsRef = This is a reference to an array of KEGG IDs of desired compounds (preferrably non-reduntant. see previous sub)
#outputs:
# \@metInfo = reference to array of strings formatted for output to a csv. contains "KEGG ID, Name, Chemical Formula,"
sub MetInfoFromID {
$MetsRef = $_[0];
@Mets = @$MetsRef;
#metabolites header
my @MetLabels = ( "KEGG ID", "Name", "Chemical Formula"); #atomic symbols will be added as columns
#however, for S, a sense of order is important (hash is more like a barrel than a linear array)
#therefore a number to hash key is stored in this array
my @MetKeys = ( );
#stores atomic symbols order, the key is the atomic symbol, the value is the "place" of that symbol