diff --git a/.rubocop_todo.yml b/.rubocop_todo.yml index 6bbd5536fa..b937fa1cc2 100644 --- a/.rubocop_todo.yml +++ b/.rubocop_todo.yml @@ -1,6 +1,6 @@ # This configuration was generated by # `rubocop --auto-gen-config --no-exclude-limit` -# on 2025-11-26 13:13:53 UTC using RuboCop version 1.81.7. +# on 2025-11-28 17:57:54 UTC using RuboCop version 1.81.7. # The point is for the user to remove these configuration records # one by one as the offenses are removed from the code base. # Note that changes in the inspected code, or installation of new @@ -299,7 +299,7 @@ Lint/UselessOr: - 'app/models/qcable/statemachine.rb' - 'app/models/ui_helper/summary.rb' -# Offense count: 6 +# Offense count: 7 # Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes, Max. Metrics/AbcSize: Exclude: @@ -307,6 +307,7 @@ Metrics/AbcSize: - 'app/jobs/export_pool_xp_to_traction_job.rb' - 'app/models/accession_service/base_service.rb' - 'app/sample_manifest_excel/sample_manifest_excel/manifest_type_list.rb' + - 'lib/insdc/import_countries.rb' # Offense count: 1 # Configuration parameters: CountComments, Max, CountAsOne, AllowedMethods, AllowedPatterns, inherit_mode. @@ -329,7 +330,7 @@ Metrics/CyclomaticComplexity: - 'app/models/accession_service/base_service.rb' - 'lib/limber/helper.rb' -# Offense count: 13 +# Offense count: 14 # Configuration parameters: CountComments, Max, CountAsOne, AllowedMethods, AllowedPatterns. Metrics/MethodLength: Exclude: @@ -340,6 +341,7 @@ Metrics/MethodLength: - 'app/models/plate/quad_creator.rb' - 'app/uat_actions/uat_actions/tube_submission.rb' - 'app/validators/novaseqx_pe_validator.rb' + - 'lib/insdc/import_countries.rb' - 'lib/record_loader/application_record_loader.rb' - 'test/unit/import_fluidigm_data_test.rb' diff --git a/app/models/insdc/country.rb b/app/models/insdc/country.rb index 0da72bce78..0f8daa6cbb 100644 --- a/app/models/insdc/country.rb +++ b/app/models/insdc/country.rb @@ -48,6 +48,10 @@ def self.options sorted_for_select.pluck(:name) end + def valid! + update!(validation_state: :valid) + end + def invalid! update!(validation_state: :invalid) end diff --git a/config/accession/tags.yml b/config/accession/tags.yml index 801a60a3bb..3908fe3ed5 100644 --- a/config/accession/tags.yml +++ b/config/accession/tags.yml @@ -1,3 +1,20 @@ +# Tags configuration for Accession::TagList +# +# Tag/field requirements, naming, and validation can be found in the checklists listed below: +# +# - ENA: https://www.ebi.ac.uk/ena/browser/view/ERC000011 +# - EGA: https://www.ebi.ac.uk/ena/browser/view/ERC000026 +# - ArrayExpress: From September 2022, the interface to ArrayExpress was retired: +# https://www.ebi.ac.uk/about/news/updates-from-data-resources/arrayexpress-retired/ +# However, it appears that as of February 2017, ENA was submitting samples to ArrayExpress on our behalf. +# See this sample for an example: +# 1. https://sequencescape.psd.sanger.ac.uk/samples/2934454 +# 2. https://www.ebi.ac.uk/ena/browser/view/ERS1545406 +# 3. https://www.ebi.ac.uk/biosamples/samples/SAMEA80468668 +# How samples are being submitted to ArrayExpress/BioSample now is currently unknown. +# +# Local copies are also stored in `data/sample_checklists/` for reference purposes as they do change occasionally. + sample_taxon_id: services: - :ENA @@ -26,6 +43,13 @@ date_of_sample_collection: groups: - :sample_attributes ebi_name: :collection_date +sample_description: + services: + - :ENA + - :EGA + groups: + - :sample_attributes + ebi_name: :sample description donor_id: services: - :EGA @@ -46,6 +70,13 @@ phenotype: groups: - :sample_attributes - :array_express +sample_strain_att: + services: + - :ENA + - :EGA + groups: + - :sample_attributes + ebi_name: :strain strain_or_line: groups: - :array_express diff --git a/config/locales/metadata/en.yml b/config/locales/metadata/en.yml index 76d5e1c171..08f1fbd67e 100644 --- a/config/locales/metadata/en.yml +++ b/config/locales/metadata/en.yml @@ -174,12 +174,14 @@ en: gender: label: Gender - edit_info: "Array Express" + help: "EGA is limited to Male, Female, or Unknown." + edit_info: "EGA, Array Express" country_of_origin: label: Country of origin help: The geographic origin of the sample, as defined by the INSDC list of countries and seas. May also be 'not provided', 'not applicable', 'not collected' or 'restricted access'. Please see the missing value documentation before providing one of these values. accessioning_tag: geographic location (country and/or sea) + edit_info: "ENA requirement" geographical_region: label: Geographical region @@ -252,6 +254,7 @@ en: label: Date of sample collection help: Dates should be in the format YYYY-MM-DDTHH:MM:SS (eg. '2020-01-25T12:59:59') with each additional level of precision being optional. May also be 'not provided', 'not collected' or 'restricted access'. Please see the missing value documentation before providing one of these values. accessioning_tag: collection_date + edit_info: "ENA requirement" date_of_sample_extraction: label: Date of sample extraction @@ -280,7 +283,7 @@ en: donor_id: label: Donor Id ena_label: subject_id - edit_info: "EGA (As subject id)" + edit_info: "EGA recommended (As subject id)" subject_id: label: subject_id @@ -291,7 +294,7 @@ en: edit_info: "Array Express" phenotype: label: Phenotype - edit_info: "Array Express" + edit_info: "EGA requirement, Array Express" strain_or_line: label: Strain or Line edit_info: "Array Express" diff --git a/data/ena_sample_checklists/.gitignore b/data/ena_sample_checklists/.gitignore deleted file mode 100644 index 8a147fd1c5..0000000000 --- a/data/ena_sample_checklists/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# We might be able to store these in the repo, but need to double check licenses -*.xml -!ERC000011.xml diff --git a/data/ena_sample_checklists/ERC000011.xml b/data/ena_sample_checklists/ERC000011.xml deleted file mode 100644 index 9739dd8816..0000000000 --- a/data/ena_sample_checklists/ERC000011.xml +++ /dev/null @@ -1,1213 +0,0 @@ - - - - - ERC000011 - - - - ENA default sample checklist - Minimum information required for the sample - ENA - - Part and developmental stage of organism - Anatomical and developmental descriptions of the sample site or source material - - - cell_type - cell type from which the sample was obtained - - - - optional - multiple - - - - dev_stage - if the sample was obtained from an organism in a specific developmental stage, it is specified with this qualifier - - - - optional - multiple - - - - germline - the sample described presented in the entry has not undergone somatic genomic rearrangement as part of an adaptive immune response; it is the unrearranged molecule that was inherited from the parental germline - - - - optional - multiple - - - - tissue_lib - tissue library from which sample was obtained - - - - optional - multiple - - - - tissue_type - tissue type from which the sample was obtained - - - - optional - multiple - - - - Collection event information - - - isolation_source - describes the physical, environmental and/or local geographical source of the biological sample from which the sample was derived - - - - optional - multiple - - - - lat_lon - geographical coordinates of the location where the specimen was collected - - - - optional - multiple - - - - collected_by - name of persons or institute who collected the specimen - - - - optional - multiple - - - - Event Date/Time - collection_date - collection date - The date the sample was collected with the intention of sequencing, either as an instance (single point in time) or interval. In case no exact time is available, the date/time can be right truncated i.e. all of these are valid ISO8601 compliant times: 2008-01-23T19:23:10+00:00; 2008-01-23T19:23:10; 2008-01-23; 2008-01; 2008. - - - (^[12][0-9]{3}(-(0[1-9]|1[0-2])(-(0[1-9]|[12][0-9]|3[01])(T[0-9]{2}:[0-9]{2}(:[0-9]{2})?Z?([+-][0-9]{1,2})?)?)?)?(/[0-9]{4}(-[0-9]{2}(-[0-9]{2}(T[0-9]{2}:[0-9]{2}(:[0-9]{2})?Z?([+-][0-9]{1,2})?)?)?)?)?$)|(^not collected$)|(^not provided$)|(^restricted access$)|(^missing: control sample$)|(^missing: sample group$)|(^missing: synthetic construct$)|(^missing: lab stock$)|(^missing: third party data$)|(^missing: data agreement established pre-2023$)|(^missing: endangered species$)|(^missing: human-identifiable$) - - - mandatory - multiple - - - - geographic location (country and/or sea) - The location the sample was collected from with the intention of sequencing, as defined by the country or sea. Country or sea names should be chosen from the INSDC country list (http://insdc.org/country.html). - - - - Afghanistan - - - Albania - - - Algeria - - - American Samoa - - - Andorra - - - Angola - - - Anguilla - - - Antarctica - - - Antigua and Barbuda - - - Arctic Ocean - - - Argentina - - - Armenia - - - Aruba - - - Ashmore and Cartier Islands - - - Atlantic Ocean - - - Australia - - - Austria - - - Azerbaijan - - - Bahamas - - - Bahrain - - - Baker Island - - - Baltic Sea - - - Bangladesh - - - Barbados - - - Bassas da India - - - Belarus - - - Belgium - - - Belize - - - Benin - - - Bermuda - - - Bhutan - - - Bolivia - - - Borneo - - - Bosnia and Herzegovina - - - Botswana - - - Bouvet Island - - - Brazil - - - British Virgin Islands - - - Brunei - - - Bulgaria - - - Burkina Faso - - - Burundi - - - Cambodia - - - Cameroon - - - Canada - - - Cape Verde - - - Cayman Islands - - - Central African Republic - - - Chad - - - Chile - - - China - - - Christmas Island - - - Clipperton Island - - - Cocos Islands - - - Colombia - - - Comoros - - - Cook Islands - - - Coral Sea Islands - - - Costa Rica - - - Cote d'Ivoire - - - Croatia - - - Cuba - - - Curacao - - - Cyprus - - - Czech Republic - - - Democratic Republic of the Congo - - - Denmark - - - Djibouti - - - Dominica - - - Dominican Republic - - - East Timor - - - Ecuador - - - Egypt - - - El Salvador - - - Equatorial Guinea - - - Eritrea - - - Estonia - - - Ethiopia - - - Europa Island - - - Falkland Islands (Islas Malvinas) - - - Faroe Islands - - - Fiji - - - Finland - - - France - - - French Guiana - - - French Polynesia - - - French Southern and Antarctic Lands - - - Gabon - - - Gambia - - - Gaza Strip - - - Georgia - - - Germany - - - Ghana - - - Gibraltar - - - Glorioso Islands - - - Greece - - - Greenland - - - Grenada - - - Guadeloupe - - - Guam - - - Guatemala - - - Guernsey - - - Guinea - - - Guinea-Bissau - - - Guyana - - - Haiti - - - Heard Island and McDonald Islands - - - Honduras - - - Hong Kong - - - Howland Island - - - Hungary - - - Iceland - - - India - - - Indian Ocean - - - Indonesia - - - Iran - - - Iraq - - - Ireland - - - Isle of Man - - - Israel - - - Italy - - - Jamaica - - - Jan Mayen - - - Japan - - - Jarvis Island - - - Jersey - - - Johnston Atoll - - - Jordan - - - Juan de Nova Island - - - Kazakhstan - - - Kenya - - - Kerguelen Archipelago - - - Kingman Reef - - - Kiribati - - - Kosovo - - - Kuwait - - - Kyrgyzstan - - - Laos - - - Latvia - - - Lebanon - - - Lesotho - - - Liberia - - - Libya - - - Liechtenstein - - - Lithuania - - - Luxembourg - - - Macau - - - Macedonia - - - Madagascar - - - Malawi - - - Malaysia - - - Maldives - - - Mali - - - Malta - - - Marshall Islands - - - Martinique - - - Mauritania - - - Mauritius - - - Mayotte - - - Mediterranean Sea - - - Mexico - - - Micronesia - - - Midway Islands - - - Moldova - - - Monaco - - - Mongolia - - - Montenegro - - - Montserrat - - - Morocco - - - Mozambique - - - Myanmar - - - Namibia - - - Nauru - - - Navassa Island - - - Nepal - - - Netherlands - - - New Caledonia - - - New Zealand - - - Nicaragua - - - Niger - - - Nigeria - - - Niue - - - Norfolk Island - - - North Korea - - - North Sea - - - Northern Mariana Islands - - - Norway - - - Oman - - - Pacific Ocean - - - Pakistan - - - Palau - - - Palmyra Atoll - - - Panama - - - Papua New Guinea - - - Paracel Islands - - - Paraguay - - - Peru - - - Philippines - - - Pitcairn Islands - - - Poland - - - Portugal - - - Puerto Rico - - - Qatar - - - Republic of the Congo - - - Reunion - - - Romania - - - Ross Sea - - - Russia - - - Rwanda - - - Saint Helena - - - Saint Kitts and Nevis - - - Saint Lucia - - - Saint Pierre and Miquelon - - - Saint Vincent and the Grenadines - - - Samoa - - - San Marino - - - Sao Tome and Principe - - - Saudi Arabia - - - Senegal - - - Serbia - - - Seychelles - - - Sierra Leone - - - Singapore - - - Sint Maarten - - - Slovakia - - - Slovenia - - - Solomon Islands - - - Somalia - - - South Africa - - - South Georgia and the South Sandwich Islands - - - South Korea - - - Southern Ocean - - - Spain - - - Spratly Islands - - - Sri Lanka - - - Sudan - - - Suriname - - - Svalbard - - - Swaziland - - - Sweden - - - Switzerland - - - Syria - - - Taiwan - - - Tajikistan - - - Tanzania - - - Tasman Sea - - - Thailand - - - Togo - - - Tokelau - - - Tonga - - - Trinidad and Tobago - - - Tromelin Island - - - Tunisia - - - Turkey - - - Turkmenistan - - - Turks and Caicos Islands - - - Tuvalu - - - USA - - - Uganda - - - Ukraine - - - United Arab Emirates - - - United Kingdom - - - Uruguay - - - Uzbekistan - - - Vanuatu - - - Venezuela - - - Viet Nam - - - Virgin Islands - - - Wake Island - - - Wallis and Futuna - - - West Bank - - - Western Sahara - - - Yemen - - - Zambia - - - Zimbabwe - - - missing: control sample - - - missing: data agreement established pre-2023 - - - missing: endangered species - - - missing: human-identifiable - - - missing: lab stock - - - missing: sample group - - - missing: synthetic construct - - - missing: third party data - - - not applicable - - - not collected - - - not provided - - - restricted access - - - - mandatory - multiple - - - - geographic location (region and locality) - The geographical origin of the sample as defined by the specific region name followed by the locality name. - - - - optional - multiple - - - - identified_by - name of the expert who identified the specimen taxonomically - - - - optional - multiple - - - - sample collection - - - environmental_sample - identifies sequences derived by direct molecular isolation from a bulk environmental DNA sample (by PCR with or without subsequent cloning of the product, DGGE, or other anonymous methods) with no reliable identification of the source organism - - - - No - - - Yes - - - - optional - multiple - - - - Organism characteristics - Characteristics of the source organism - - - mating_type - mating type of the organism from which the sequence was obtained; mating type is used for prokaryotes, and for eukaryotes that undergo meiosis without sexually dimorphic gametes - - - - optional - multiple - - - - sex - sex of the organism from which the sample was obtained - - - - optional - multiple - - - - host description - - - lab_host - scientific name of the laboratory host used to propagate the source organism from which the sample was obtained - - - - optional - multiple - - - - specific host - host scientific name - Scientific name of the natural (as opposed to laboratory) host to the organism from which sample was obtained. - - - - optional - multiple - - - - Pointer to physical material - References to sample or sample source material in physical resources - - - bio_material - Unique identifier that references the biological material from which the sample was obtained and that ideally exists in a curated collection (e.g. stock centres, seed banks, DNA banks). The ID should have the following structure: name of the institution (institution code) followed by the collection code (if available) and the voucher id (institution_code:collection_code:voucher_id). Please note institution codes and collection codes are taken from a controlled vocabulary maintained by the INSDC: https://ftp.ncbi.nih.gov/pub/taxonomy/biocollections/ - - - - optional - multiple - - - - culture_collection - Unique identifier that references the culture (e.g. live microbial and viral cultures and cell lines) from which the sample has been obtained and that have been deposited in curated culture collections. The ID needs to provide an institution code and the culture id, with optional collection code, in the following structure: (-institution_code:(collection_code):voucher_id. Please note institution codes (and optional collection codes) are taken from a controlled vocabulary maintained by the INSDC: https://ftp.ncbi.nih.gov/pub/taxonomy/biocollections/ - - - - optional - multiple - - - - specimen_voucher - Unique identifier that references the physical specimen that remains after the sequence has been obtained and that ideally exists in a curated collection. The ID should have the following structure: name of the institution (institution code) followed by the collection code (if available) and the voucher id (institution_code:collection_code:voucher_id). Please note institution codes and collection codes are taken from a controlled vocabulary maintained by the INSDC: https://ftp.ncbi.nih.gov/pub/taxonomy/biocollections/ - - - - optional - multiple - - - - Infraspecies information - Formal and informal infraspecies taxonomic information - - - cultivar - cultivar (cultivated variety) of plant from which sample was obtained - - - - optional - multiple - - - - ecotype - a population within a given species displaying genetically based, phenotypic traits that reflect adaptation to a local habitat. - - - - optional - multiple - - - - isolate - individual isolate from which the sample was obtained - - - - optional - multiple - - - - sub_species - name of sub-species of organism from which sample was obtained - - - - optional - multiple - - - - variety - variety (= varietas, a formal Linnaean rank) of organism from which sample was derived. - - - - optional - multiple - - - - sub_strain - name or identifier of a genetically or otherwise modified strain from which sample was obtained, derived from a parental strain (which should be annotated in the strain field; sub_strain from which sample was obtained - - - - optional - multiple - - - - cell_line - cell line from which the sample was obtained - - - - optional - multiple - - - - serotype - serological variety of a species characterized by its antigenic properties - - - - optional - multiple - - - - serovar - serological variety of a species (usually a prokaryote) characterized by its antigenic properties - - - - optional - multiple - - - - strain - Name of the strain from which the sample was obtained. - - - - optional - multiple - - - - - diff --git a/data/sample_checklists/.gitignore b/data/sample_checklists/.gitignore new file mode 100644 index 0000000000..9c942dd370 --- /dev/null +++ b/data/sample_checklists/.gitignore @@ -0,0 +1,4 @@ +# Terms of use: https://www.ebi.ac.uk/about/terms-of-use/ +*.xml +!ERC000011.xml +!ERC000026.xml diff --git a/data/sample_checklists/ERC000011.xml b/data/sample_checklists/ERC000011.xml new file mode 100644 index 0000000000..8a953ca8c2 --- /dev/null +++ b/data/sample_checklists/ERC000011.xml @@ -0,0 +1,1227 @@ + + + + + ERC000011 + + + + ENA default sample checklist + Minimum information required for the sample + ENA + + Part and developmental stage of organism + + + cell_type + cell type from which the sample was obtained + + + + optional + single + + + + dev_stage + if the sample was obtained from an organism in a specific developmental stage, it is specified with this qualifier + + + + optional + single + + + + germline + the sample described presented in the entry has not undergone somatic genomic rearrangement as part of an adaptive immune response; it is the unrearranged molecule that was inherited from the parental germline + + + + optional + single + + + + tissue_lib + tissue library from which sample was obtained + + + + optional + single + + + + tissue_type + tissue type from which the sample was obtained + + + + optional + single + + + + Collection event information + + + isolation_source + describes the physical, environmental and/or local geographical source of the biological sample from which the sample was derived + + + + optional + single + + + + lat_lon + geographical coordinates of the location where the specimen was collected + + + + optional + single + + + + collected_by + name of persons or institute who collected the specimen + + + + optional + single + + + + collection_date + The date the sample was collected with the intention of sequencing, either as an instance (single point in time) or interval. In case no exact time is available, the date/time can be right truncated i.e. all of these are valid ISO8601 compliant times: 2008-01-23T19:23:10+00:00; 2008-01-23T19:23:10; 2008-01-23; 2008-01; 2008. + + + (^[12][0-9]{3}(-(0[1-9]|1[0-2])(-(0[1-9]|[12][0-9]|3[01])(T[0-9]{2}:[0-9]{2}(:[0-9]{2})?Z?([+-][0-9]{1,2})?)?)?)?(/[0-9]{4}(-[0-9]{2}(-[0-9]{2}(T[0-9]{2}:[0-9]{2}(:[0-9]{2})?Z?([+-][0-9]{1,2})?)?)?)?)?$)|(^not applicable$)|(^not collected$)|(^not provided$)|(^restricted access$)|(^missing: control sample$)|(^missing: sample group$)|(^missing: synthetic construct$)|(^missing: lab stock$)|(^missing: third party data$)|(^missing: data agreement established pre-2023$)|(^missing: endangered species$)|(^missing: human-identifiable$)|(^missing$) + + + mandatory + single + + + + geographic_location_region_and_locality + The geographical origin of the sample as defined by the specific region name followed by the locality name. + + + + optional + single + + + + identified_by + name of the expert who identified the specimen taxonomically + + + + optional + single + + + + sample collection + + + environmental_sample + identifies sequences derived by direct molecular isolation from a bulk environmental DNA sample (by PCR with or without subsequent cloning of the product, DGGE, or other anonymous methods) with no reliable identification of the source organism + + + + No + + + Yes + + + + optional + single + + + + Organism characteristics + + + mating_type + mating type of the organism from which the sequence was obtained; mating type is used for prokaryotes, and for eukaryotes that undergo meiosis without sexually dimorphic gametes + + + + optional + single + + + + sex + sex of the organism from which the sample was obtained + + + + optional + single + + + + geographic_location_country_andor_sea + The geographical origin of where the sample was collected from, with the intention of sequencing, as defined by the country or sea name. Country or sea names should be chosen from the INSDC country list (http://insdc.org/country.html). + + + + Afghanistan + + + Albania + + + Algeria + + + American Samoa + + + Andorra + + + Angola + + + Anguilla + + + Antarctica + + + Antigua and Barbuda + + + Arctic Ocean + + + Argentina + + + Armenia + + + Aruba + + + Ashmore and Cartier Islands + + + Atlantic Ocean + + + Australia + + + Austria + + + Azerbaijan + + + Bahamas + + + Bahrain + + + Baker Island + + + Baltic Sea + + + Bangladesh + + + Barbados + + + Bassas da India + + + Belarus + + + Belgium + + + Belize + + + Benin + + + Bermuda + + + Bhutan + + + Bolivia + + + Borneo + + + Bosnia and Herzegovina + + + Botswana + + + Bouvet Island + + + Brazil + + + British Virgin Islands + + + Brunei + + + Bulgaria + + + Burkina Faso + + + Burundi + + + Cambodia + + + Cameroon + + + Canada + + + Cape Verde + + + Cayman Islands + + + Central African Republic + + + Chad + + + Chile + + + China + + + Christmas Island + + + Clipperton Island + + + Cocos Islands + + + Colombia + + + Comoros + + + Cook Islands + + + Coral Sea Islands + + + Costa Rica + + + Cote d'Ivoire + + + Croatia + + + Cuba + + + Curacao + + + Cyprus + + + Czechia + + + Czech Republic + + + Democratic Republic of the Congo + + + Denmark + + + Djibouti + + + Dominica + + + Dominican Republic + + + Ecuador + + + Egypt + + + El Salvador + + + Equatorial Guinea + + + Eritrea + + + Estonia + + + Eswatini + + + Ethiopia + + + Europa Island + + + Falkland Islands (Islas Malvinas) + + + Faroe Islands + + + Fiji + + + Finland + + + France + + + French Guiana + + + French Polynesia + + + French Southern and Antarctic Lands + + + Gabon + + + Gambia + + + Gaza Strip + + + Georgia + + + Germany + + + Ghana + + + Gibraltar + + + Glorioso Islands + + + Greece + + + Greenland + + + Grenada + + + Guadeloupe + + + Guam + + + Guatemala + + + Guernsey + + + Guinea + + + Guinea-Bissau + + + Guyana + + + Haiti + + + Heard Island and McDonald Islands + + + Honduras + + + Hong Kong + + + Howland Island + + + Hungary + + + Iceland + + + India + + + Indian Ocean + + + Indonesia + + + Iran + + + Iraq + + + Ireland + + + Isle of Man + + + Israel + + + Italy + + + Jamaica + + + Jan Mayen + + + Japan + + + Jarvis Island + + + Jersey + + + Johnston Atoll + + + Jordan + + + Juan de Nova Island + + + Kazakhstan + + + Kenya + + + Kerguelen Archipelago + + + Kingman Reef + + + Kiribati + + + Kosovo + + + Kuwait + + + Kyrgyzstan + + + Laos + + + Latvia + + + Lebanon + + + Lesotho + + + Liberia + + + Libya + + + Liechtenstein + + + Line Islands + + + Lithuania + + + Luxembourg + + + Macau + + + Madagascar + + + Malawi + + + Malaysia + + + Maldives + + + Mali + + + Malta + + + Marshall Islands + + + Martinique + + + Mauritania + + + Mauritius + + + Mayotte + + + Mediterranean Sea + + + Mexico + + + Micronesia, Federated States of + + + Midway Islands + + + Moldova + + + Monaco + + + Mongolia + + + Montenegro + + + Montserrat + + + Morocco + + + Mozambique + + + Myanmar + + + Namibia + + + Nauru + + + Navassa Island + + + Nepal + + + Netherlands + + + New Caledonia + + + New Zealand + + + Nicaragua + + + Niger + + + Nigeria + + + Niue + + + Norfolk Island + + + North Korea + + + North Macedonia + + + North Sea + + + Northern Mariana Islands + + + Norway + + + Oman + + + Pacific Ocean + + + Pakistan + + + Palau + + + Palmyra Atoll + + + Panama + + + Papua New Guinea + + + Paracel Islands + + + Paraguay + + + Peru + + + Philippines + + + Pitcairn Islands + + + Poland + + + Portugal + + + Puerto Rico + + + Qatar + + + Republic of the Congo + + + Reunion + + + Romania + + + Ross Sea + + + Russia + + + Rwanda + + + Saint Barthelemy + + + Saint Helena + + + Saint Kitts and Nevis + + + Saint Lucia + + + Saint Pierre and Miquelon + + + Saint Vincent and the Grenadines + + + Samoa + + + San Marino + + + Saint Martin + + + Sao Tome and Principe + + + Saudi Arabia + + + Senegal + + + Serbia + + + Seychelles + + + Sierra Leone + + + Singapore + + + Sint Maarten + + + Slovakia + + + Slovenia + + + Solomon Islands + + + Somalia + + + South Africa + + + South Georgia and the South Sandwich Islands + + + South Korea + + + South Sudan + + + Southern Ocean + + + Spain + + + Spratly Islands + + + Sri Lanka + + + State of Palestine + + + Sudan + + + Suriname + + + Svalbard + + + Sweden + + + Switzerland + + + Syria + + + Taiwan + + + Tajikistan + + + Tanzania + + + Tasman Sea + + + Thailand + + + Timor-Leste + + + Togo + + + Tokelau + + + Tonga + + + Trinidad and Tobago + + + Tromelin Island + + + Tunisia + + + Turkey + + + Turkmenistan + + + Turks and Caicos Islands + + + Tuvalu + + + USA + + + Uganda + + + Ukraine + + + United Arab Emirates + + + United Kingdom + + + Uruguay + + + Uzbekistan + + + Vanuatu + + + Venezuela + + + Viet Nam + + + Wake Island + + + Virgin Islands + + + Wallis and Futuna + + + West Bank + + + Western Sahara + + + Yemen + + + Zambia + + + Zimbabwe + + + missing + + + missing: control sample + + + missing: data agreement established pre-2023 + + + missing: endangered species + + + missing: human-identifiable + + + missing: lab stock + + + missing: sample group + + + missing: synthetic construct + + + missing: third party data + + + not applicable + + + not collected + + + not provided + + + restricted access + + + + mandatory + single + + + + host description + + + lab_host + scientific name of the laboratory host used to propagate the source organism from which the sample was obtained + + + + optional + single + + + + host_scientific_name + Scientific name of the natural (as opposed to laboratory) host to the organism from which sample was obtained. + + + + optional + single + + + + Pointer to physical material + + + bio_material + Unique identifier that references the biological material from which the sample was obtained and that ideally exists in a curated collection (e.g. stock centres, seed banks, DNA banks). The ID should have the following structure: name of the institution (institution code) followed by the collection code (if available) and the voucher id (institution_code:collection_code:voucher_id). Please note institution codes and collection codes are taken from a controlled vocabulary maintained by the INSDC: https://ftp.ncbi.nih.gov/pub/taxonomy/biocollections/ + + + + optional + single + + + + culture_collection + Unique identifier that references the culture (e.g. live microbial and viral cultures and cell lines) from which the sample has been obtained and that have been deposited in curated culture collections. The ID needs to provide an institution code and the culture id, with optional collection code, in the following structure: (-institution_code:(collection_code):voucher_id. Please note institution codes (and optional collection codes) are taken from a controlled vocabulary maintained by the INSDC: https://ftp.ncbi.nih.gov/pub/taxonomy/biocollections/ + + + + optional + single + + + + specimen_voucher + Unique identifier that references the physical specimen that remains after the sequence has been obtained and that ideally exists in a curated collection. The ID should have the following structure: name of the institution (institution code) followed by the collection code (if available) and the voucher id (institution_code:collection_code:voucher_id). Please note institution codes and collection codes are taken from a controlled vocabulary maintained by the INSDC: https://ftp.ncbi.nih.gov/pub/taxonomy/biocollections/ + + + + optional + single + + + + Infraspecies information + + + cultivar + cultivar (cultivated variety) of plant from which sample was obtained + + + + optional + single + + + + ecotype + a population within a given species displaying genetically based, phenotypic traits that reflect adaptation to a local habitat. + + + + optional + single + + + + isolate + individual isolate from which the sample was obtained + + + + optional + single + + + + sub_species + name of sub-species of organism from which sample was obtained + + + + optional + single + + + + variety + variety (= varietas, a formal Linnaean rank) of organism from which sample was derived. + + + + optional + single + + + + sub_strain + name or identifier of a genetically or otherwise modified strain from which sample was obtained, derived from a parental strain (which should be annotated in the strain field; sub_strain from which sample was obtained + + + + optional + single + + + + cell_line + cell line from which the sample was obtained + + + + optional + single + + + + serotype + serological variety of a species characterized by its antigenic properties + + + + optional + single + + + + serovar + serological variety of a species (usually a prokaryote) characterized by its antigenic properties + + + + optional + single + + + + strain + Name of the strain from which the sample was obtained. + + + + optional + single + + + + + diff --git a/data/sample_checklists/ERC000026.xml b/data/sample_checklists/ERC000026.xml new file mode 100644 index 0000000000..e34f856f4f --- /dev/null +++ b/data/sample_checklists/ERC000026.xml @@ -0,0 +1,149 @@ + + + + + ERC000026 + + + + EGA default checklist + The minimum sample requirements for EGA + ENA + + Organism characteristics + + + sex + sex of the organism from which the sample was obtained + + + + optional + single + + + + default + + + subject_id + Identifier for the subject where the sample has been derived from + + + + recommended + single + + + + gender + Sex + + + + female + + + male + + + unknown + + + + recommended + single + + + + phenotype + Where possible, please use the Experimental Factor Ontology (EFO) to describe your phenotypes. + + + + mandatory + single + + + + disease_site + Affected organ + + + + optional + single + + + + sample_type + Affected organ + + + + blood + + + bone metastases + + + cell line + + + csf + + + diseased tissue + + + lymph node metastases + + + metastatic + + + normal adjacent from prostate + + + normal from other tissue + + + normal prostate from healthy individual + + + normal tissue + + + plasma + + + primary sample + + + serum + + + soft tissue metastases + + + urine + + + + optional + single + + + + donor_id + Identifier of the donor where the sample has been derived from + + + + optional + single + + + + + diff --git a/lib/accession/tag.rb b/lib/accession/tag.rb index 409d2daa37..c0d64a7ea2 100644 --- a/lib/accession/tag.rb +++ b/lib/accession/tag.rb @@ -129,8 +129,7 @@ def value_for(record, key) class TagCollectionDate < Tag # rubocop:disable Layout/LineLength REGEXP = - %r{(^[12][0-9]{3}(-(0[1-9]|1[0-2])(-(0[1-9]|[12][0-9]|3[01])(T[0-9]{2}:[0-9]{2}(:[0-9]{2})?Z?([+-][0-9]{1,2})?)?)?)?(/[0-9]{4}(-[0-9]{2}(-[0-9]{2}(T[0-9]{2}:[0-9]{2}(:[0-9]{2})?Z?([+-][0-9]{1,2})?)?)?)?)?$)} - + %r{(^[12][0-9]{3}(-(0[1-9]|1[0-2])(-(0[1-9]|[12][0-9]|3[01])(T[0-9]{2}:[0-9]{2}(:[0-9]{2})?Z?([+-][0-9]{1,2})?)?)?)?(/[0-9]{4}(-[0-9]{2}(-[0-9]{2}(T[0-9]{2}:[0-9]{2}(:[0-9]{2})?Z?([+-][0-9]{1,2})?)?)?)?)?$)|(^not applicable$)|(^not collected$)|(^not provided$)|(^restricted access$)|(^missing: control sample$)|(^missing: sample group$)|(^missing: synthetic construct$)|(^missing: lab stock$)|(^missing: third party data$)|(^missing: data agreement established pre-2023$)|(^missing: endangered species$)|(^missing: human-identifiable$)|(^missing$)} # rubocop:enable Layout/LineLength def value_for(record, key) diff --git a/lib/insdc/import_countries.rb b/lib/insdc/import_countries.rb index db86dc4efe..a6fe1c23db 100644 --- a/lib/insdc/import_countries.rb +++ b/lib/insdc/import_countries.rb @@ -4,9 +4,9 @@ # Handles the download and import of permitted country fields from the ENA class Insdc::ImportCountries - FILE_ROOT = Rails.root.join('data/ena_sample_checklists') + FILE_ROOT = Rails.root.join('data/sample_checklists') FIELD_NAME = 'geographic location (country and/or sea)' - XPATH = "//FIELD//NAME[text() = '#{FIELD_NAME}']/following-sibling::FIELD_TYPE//TEXT_VALUE//VALUE".freeze + XPATH = "//FIELD//LABEL[text() = '#{FIELD_NAME}']/following-sibling::FIELD_TYPE//TEXT_VALUE//VALUE".freeze def initialize(ena_root:, sample_checklist:, priorities: {}) @ena_root = ena_root @@ -24,18 +24,37 @@ def download(force: false) def import no_file_error if file_missing? - pending_countries = countries_to_import + existing_valid_countries = Insdc::Country.valid_state.pluck(:name) + existing_invalid_countries = Insdc::Country.invalid_state.pluck(:name) + existing_countries = Insdc::Country.pluck(:name) - # Mark countries as invalid if they aren't on the list - Insdc::Country.find_each { |country| country.invalid! unless pending_countries.delete(country.name) } + existing_countries_to_mark_valid = countries_to_import & existing_invalid_countries + existing_countries_to_mark_invalid = existing_valid_countries - countries_to_import + new_countries_to_add = countries_to_import - existing_countries - generate_countries(pending_countries) + mark_as_valid(existing_countries_to_mark_valid) + mark_as_invalid(existing_countries_to_mark_invalid) + add_countries(new_countries_to_add) + + Rails.logger.info { "#{existing_countries_to_mark_valid.size} existing countries marked as valid" } + Rails.logger.info { "#{existing_countries_to_mark_invalid.size} existing countries marked as invalid" } + Rails.logger.info { "#{new_countries_to_add.size} new countries added" } end private - def generate_countries(pending_countries) - Insdc::Country.import(pending_countries.map { |name| { name: name, sort_priority: priority_for(name) } }) + def mark_as_valid(countries_to_mark) + Insdc::Country.where(name: countries_to_mark).find_each(&:valid!) + end + + def mark_as_invalid(countries_to_mark) + Insdc::Country.where(name: countries_to_mark).find_each(&:invalid!) + end + + def add_countries(new_countries_to_add) + Insdc::Country.import(new_countries_to_add.map do |name| + { name: name, sort_priority: priority_for(name), validation_state: :valid } + end) end def priority_for(name) @@ -47,7 +66,7 @@ def xml end def countries_to_import - xml.root.get_elements(XPATH).map(&:text) + @countries_to_import ||= xml.root.get_elements(XPATH).map(&:text) end def url diff --git a/lib/tasks/insdc/import_countries.rake b/lib/tasks/insdc/import_countries.rake index 791903e3f0..052b73897d 100644 --- a/lib/tasks/insdc/import_countries.rake +++ b/lib/tasks/insdc/import_countries.rake @@ -14,6 +14,7 @@ unless defined?(INSDC_COUNTRIES_PRIORITIES) INSDC_COUNTRIES_PRIORITIES = { 'not provided' => 2, 'United Kingdom' => 1, + 'missing' => -1, 'not applicable' => -1, 'not collected' => -1, 'restricted access' => -1 diff --git a/spec/insdc/import_countries_spec.rb b/spec/insdc/import_countries_spec.rb index 3c81d5927d..7d77afd196 100644 --- a/spec/insdc/import_countries_spec.rb +++ b/spec/insdc/import_countries_spec.rb @@ -46,8 +46,8 @@ - geographic location (country and/or sea) - The geographical origin of the sample as defined by the country or sea. Country or sea names should be chosen from the INSDC country list (http://insdc.org/country.html). + geographic_location_country_andor_sea + The geographical origin of where the sample was collected from, with the intention of sequencing, as defined by the country or sea name. Country or sea names should be chosen from the INSDC country list (http://insdc.org/country.html). @@ -131,8 +131,8 @@ context 'when the file is present' do before do - create(:insdc_country, name: 'Historic Coldland') - create(:insdc_country, name: 'East Westland') + create(:insdc_country, :valid, name: 'Historic Coldland') + create(:insdc_country, :invalid, name: 'East Westland') allow(File).to receive(:exist?).with(cached_file_path).and_return(true) allow(File).to receive(:open).with(cached_file_path).and_yield(mock_response) importer.import @@ -152,6 +152,11 @@ ) end + it 're-validates existing entries' do + added_country = Insdc::Country.find_by!(name: 'East Westland') + expect(added_country).to have_attributes(name: 'East Westland', sort_priority: 0, validation_state: 'valid') + end + it 'can set priorities' do added_country = Insdc::Country.find_by!(name: 'not applicable') expect(added_country).to have_attributes(name: 'not applicable', sort_priority: 1, validation_state: 'valid') diff --git a/spec/lib/accession/sample_spec.rb b/spec/lib/accession/sample_spec.rb index 511e2e53bf..790501709b 100644 --- a/spec/lib/accession/sample_spec.rb +++ b/spec/lib/accession/sample_spec.rb @@ -18,7 +18,10 @@ def find_value_at_tag(xml_received, tag_name) RSpec.describe Accession::Sample, :accession, type: :model do let(:tag_list) { build(:standard_accession_tag_list) } - before { @country = create(:insdc_country, name: 'Australia') } + before do + create(:insdc_country, name: 'Australia') + create(:insdc_country, name: 'Niue') + end it 'is not sent for accessioning if the sample has already been accessioned' do sample = @@ -41,55 +44,51 @@ def find_value_at_tag(xml_received, tag_name) expect(described_class.new(tag_list, sample)).not_to be_valid end - it "is not sent for accessioning if the sample doesn't have the required fields" do - sample = - create( - :sample_for_accessioning_with_open_study, - sample_metadata: create(:sample_metadata_for_accessioning, sample_taxon_id: nil) - ) - expect(described_class.new(tag_list, sample)).not_to be_valid + context 'when validating' do + let(:sample_metadata) { create(:sample_metadata_for_accessioning) } - sample = - create( - :sample_for_accessioning_with_open_study, - sample_metadata: create(:sample_metadata_for_accessioning, sample_common_name: nil) - ) - expect(described_class.new(tag_list, sample)).not_to be_valid + context 'with an open study' do + let(:sample) { create(:sample_for_accessioning_with_open_study, sample_metadata:) } - sample = - create( - :sample_for_accessioning_with_managed_study, - sample_metadata: create(:sample_metadata_for_accessioning, gender: nil) - ) - expect(described_class.new(tag_list, sample)).not_to be_valid + it 'is required to define sample_taxon_id' do + sample.sample_metadata.sample_taxon_id = nil + expect(described_class.new(tag_list, sample)).not_to be_valid + end - sample = - create( - :sample_for_accessioning_with_managed_study, - sample_metadata: create(:sample_metadata_for_accessioning, phenotype: nil) - ) - expect(described_class.new(tag_list, sample)).not_to be_valid + it 'is required to define sample_common_name' do + sample.sample_metadata.sample_common_name = nil + expect(described_class.new(tag_list, sample)).not_to be_valid + end + end - sample = - create( - :sample_for_accessioning_with_managed_study, - sample_metadata: create(:sample_metadata_for_accessioning, donor_id: nil) - ) - expect(described_class.new(tag_list, sample)).not_to be_valid + context 'with a managed study' do + let(:sample) { create(:sample_for_accessioning_with_managed_study, sample_metadata:) } - sample = - create( - :sample_for_accessioning_with_managed_study, - sample_metadata: create(:sample_metadata_for_accessioning, sample_taxon_id: nil) - ) - expect(described_class.new(tag_list, sample)).not_to be_valid + it 'is required to define sample_taxon_id' do + sample.sample_metadata.sample_taxon_id = nil + expect(described_class.new(tag_list, sample)).not_to be_valid + end - sample = - create( - :sample_for_accessioning_with_managed_study, - sample_metadata: create(:sample_metadata_for_accessioning, sample_common_name: nil) - ) - expect(described_class.new(tag_list, sample)).not_to be_valid + it 'is required to define sample_common_name' do + sample.sample_metadata.sample_common_name = nil + expect(described_class.new(tag_list, sample)).not_to be_valid + end + + it 'is required to define gender' do + sample.sample_metadata.gender = nil + expect(described_class.new(tag_list, sample)).not_to be_valid + end + + it 'is required to define phenotype' do + sample.sample_metadata.phenotype = nil + expect(described_class.new(tag_list, sample)).not_to be_valid + end + + it 'is required to define donor_id' do + sample.sample_metadata.donor_id = nil + expect(described_class.new(tag_list, sample)).not_to be_valid + end + end end it 'an appropriate service should be chosen based on the associated study' do @@ -125,16 +124,16 @@ def find_value_at_tag(xml_received, tag_name) end it 'creates some xml with valid attributes' do - sample = described_class.new(tag_list, create(:sample_for_accessioning_with_open_study)) - xml = sample.to_xml + accession_sample = described_class.new(tag_list, create(:sample_for_accessioning_with_open_study)) + xml = accession_sample.to_xml expect(xml).to match(%r{.*}m) expect(xml).to include(%r{.*.*}m) - expect(xml).to include("alias=\"#{sample.ebi_alias}\"") - expect(xml).to include("#{sample.title}") + expect(xml).to include("alias=\"#{accession_sample.ebi_alias}\"") + expect(xml).to include("#{accession_sample.title}") - tags = sample.tags.by_group[:sample_name] + tags = accession_sample.tags.by_group[:sample_name] sample_name_tags = xml tags.each do |_label, tag| expected_tag = tag.label.tr(' ', '_').upcase @@ -143,104 +142,272 @@ def find_value_at_tag(xml_received, tag_name) sample_attributes_tags = xml - tags = sample.tags.by_group[:sample_attributes] + tags = accession_sample.tags.by_group[:sample_attributes] expect(sample_attributes_tags).to include(*tags.labels.map { |label| "#{label}" }) expect(sample_attributes_tags).to include(*tags.values.map { |value| "#{value}" }) - tags = sample.tags.by_group[:array_express] + tags = accession_sample.tags.by_group[:array_express] expect(sample_attributes_tags).to include(*tags.array_express_labels.map { |label| "#{label}" }) expect(sample_attributes_tags).to include(*tags.values.map { |value| "#{value}" }) - sample = described_class.new(tag_list, create(:sample_for_accessioning_with_managed_study)) - xml = sample.to_xml + accession_sample = described_class.new(tag_list, create(:sample_for_accessioning_with_managed_study)) + xml = accession_sample.to_xml sample_attributes_tags = xml expect(sample_attributes_tags).not_to include(*tags.array_express_labels.map { |label| "#{label}" }) end - it 'can update accession number for sample' do - sample = described_class.new(tag_list, create(:sample_for_accessioning_with_open_study)) - expect(sample.update_accession_number('ENA1234')).to be_truthy - expect(sample.ebi_accession_number).to eq('ENA1234') + describe '#update_accession_number' do + let(:sample) { create(:sample_for_accessioning_with_open_study) } + let(:accession_sample) { described_class.new(tag_list, sample) } + let(:test_accession_number) { 'ENA12345' } + + before do + expect(accession_sample.ebi_accession_number).to be_nil + accession_sample.update_accession_number(test_accession_number) + end + + it 'sets the ebi_accession_number to the provided value' do + expect(accession_sample.ebi_accession_number).to eq(test_accession_number) + end end describe '#to_xml' do + let(:sample) { create(:sample_for_accessioning_with_open_study) } + let(:accession_sample) { described_class.new(tag_list, sample) } + let(:xml) { accession_sample.to_xml } + context 'with country of origin' do it 'includes country of origin' do - sample = described_class.new(tag_list, create(:sample_for_accessioning_with_open_study)) - expect(sample.to_xml).to include(COUNTRY_TAG) - expect(find_value_at_tag(sample.to_xml, COUNTRY_TAG)).to eq('Australia') + expect(xml).to include(COUNTRY_TAG) + end + + it 'displays the country when country is specified' do + expect(find_value_at_tag(xml, COUNTRY_TAG)).to eq('Australia') end it 'displays not provided when country is empty' do - smpl = create(:sample_for_accessioning_with_open_study) - smpl.sample_metadata.update(country_of_origin: nil) - sample = described_class.new(tag_list, smpl) - expect(sample.to_xml).to include(COUNTRY_TAG) - expect(find_value_at_tag(sample.to_xml, COUNTRY_TAG)).to eq('not provided') + sample.sample_metadata.update(country_of_origin: nil) + expect(find_value_at_tag(xml, COUNTRY_TAG)).to eq('not provided') end it 'displays not provided when country value is not provided' do - smpl = create(:sample_for_accessioning_with_open_study) - smpl.sample_metadata.update(country_of_origin: 'not provided') - sample = described_class.new(tag_list, smpl) - expect(sample.to_xml).to include(COUNTRY_TAG) - expect(find_value_at_tag(sample.to_xml, COUNTRY_TAG)).to eq('not provided') + sample.sample_metadata.update(country_of_origin: 'not provided') + expect(find_value_at_tag(xml, COUNTRY_TAG)).to eq('not provided') end it 'displays not provided when country value is wrong' do - smpl = create(:sample_for_accessioning_with_open_study) - smpl.sample_metadata.update(country_of_origin: 'Freedonia') - sample = described_class.new(tag_list, smpl) - expect(sample.to_xml).to include(COUNTRY_TAG) - expect(find_value_at_tag(sample.to_xml, COUNTRY_TAG)).to eq('not provided') + sample.sample_metadata.update(country_of_origin: 'Freedonia') + expect(find_value_at_tag(xml, COUNTRY_TAG)).to eq('not provided') end it 'displays missing when country of origin is specified as missing' do - smpl = create(:sample_for_accessioning_with_open_study) - smpl.sample_metadata.update(country_of_origin: 'missing: human-identifiable') - sample = described_class.new(tag_list, smpl) - expect(sample.to_xml).to include(COUNTRY_TAG) - expect(find_value_at_tag(sample.to_xml, COUNTRY_TAG)).to eq('missing: human-identifiable') + sample.sample_metadata.update(country_of_origin: 'missing: human-identifiable') + expect(find_value_at_tag(xml, COUNTRY_TAG)).to eq('missing: human-identifiable') end end context 'with collection date' do it 'includes collection date' do - sample = described_class.new(tag_list, create(:sample_for_accessioning_with_open_study)) - expect(sample.to_xml).to include(COLLECTION_DATE_TAG) - expect(find_value_at_tag(sample.to_xml, COLLECTION_DATE_TAG)).to eq('2000-01-01T00:00') + expect(xml).to include(COLLECTION_DATE_TAG) + end + + it 'displays the collection date when correctly specified' do + expect(find_value_at_tag(xml, COLLECTION_DATE_TAG)).to eq('2000-01-01T00:00') end it 'displays not provided when collection date is empty' do - smpl = create(:sample_for_accessioning_with_open_study) - smpl.sample_metadata.update(date_of_sample_collection: nil) - sample = described_class.new(tag_list, smpl) - expect(sample.to_xml).to include(COLLECTION_DATE_TAG) - expect(find_value_at_tag(sample.to_xml, COLLECTION_DATE_TAG)).to eq('not provided') + sample.sample_metadata.update(date_of_sample_collection: nil) + expect(find_value_at_tag(xml, COLLECTION_DATE_TAG)).to eq('not provided') end it 'displays not provided when collection date is not provided' do - smpl = create(:sample_for_accessioning_with_open_study) - smpl.sample_metadata.update(date_of_sample_collection: 'not provided') - sample = described_class.new(tag_list, smpl) - expect(sample.to_xml).to include(COLLECTION_DATE_TAG) - expect(find_value_at_tag(sample.to_xml, COLLECTION_DATE_TAG)).to eq('not provided') + sample.sample_metadata.update(date_of_sample_collection: 'not provided') + expect(find_value_at_tag(xml, COLLECTION_DATE_TAG)).to eq('not provided') end it 'displays not provided when collection date is wrong' do - smpl = create(:sample_for_accessioning_with_open_study) - smpl.sample_metadata.update(date_of_sample_collection: '2000-99-01T00:00') - sample = described_class.new(tag_list, smpl) - expect(sample.to_xml).to include(COLLECTION_DATE_TAG) - expect(find_value_at_tag(sample.to_xml, COLLECTION_DATE_TAG)).to eq('not provided') + sample.sample_metadata.update(date_of_sample_collection: '2000-99-01T00:00') + expect(find_value_at_tag(xml, COLLECTION_DATE_TAG)).to eq('not provided') end it 'displays missing when collection date is specified as missing' do - smpl = create(:sample_for_accessioning_with_open_study) - smpl.sample_metadata.update(date_of_sample_collection: 'missing: human-identifiable') - sample = described_class.new(tag_list, smpl) - expect(sample.to_xml).to include(COLLECTION_DATE_TAG) - expect(find_value_at_tag(sample.to_xml, COLLECTION_DATE_TAG)).to eq('missing: human-identifiable') + sample.sample_metadata.update(date_of_sample_collection: 'missing: human-identifiable') + expect(find_value_at_tag(xml, COLLECTION_DATE_TAG)).to eq('missing: human-identifiable') + end + end + + context 'with all possible tags' do + # This section shows which tags are generated in the XML under which circumstances. + # This is technically more of an integration test as it involves several classes, but I + # needed a clear way to easily validate generated output. + + # Uses the actual tag list loaded from config, not the factory + before do + Accession.configure do |config| + config.folder = File.join('config', 'accession') + config.load! + end + end + + let(:tag_list) { Accession.configuration.tags } + let(:sample_metadata) do + create( + :minimal_sample_metadata_for_accessioning, + + # "Standard" and ENA + organism: 'organism', + cohort: 'cohort', + country_of_origin: 'Niue', # A South Pacific island, now you know: https://en.wikipedia.org/wiki/Niue + geographical_region: 'geographical_region', + ethnicity: 'ethnicity', + volume: 'volume', + mother: 'mother', + father: 'father', + replicate: 'replicate', + gc_content: 'High AT', + gender: 'Female', + donor_id: 'donor_id', + dna_source: 'Brain', + sample_public_name: 'sample_public_name', + sample_ebi_accession_number: 'sample_ebi_accession_number', + sample_description: 'sample_description', + sample_sra_hold: 'Protect', + sibling: 'sibling', + is_resubmitted: 'is_resubmitted', + date_of_sample_collection: '2020-02-02', + date_of_sample_extraction: 'date_of_sample_extraction', + sample_extraction_method: 'sample_extraction_method', + sample_purified: 'sample_purified', + purification_method: 'purification_method', + concentration: 'concentration', + concentration_determined_by: 'concentration_determined_by', + sample_type: 'sample_type', + sample_storage_conditions: 'sample_storage_conditions', + collected_by: 'collected_by', + + # Array Express + genotype: 'genotype', + phenotype: 'phenotype', + sample_strain_att: 'sample_strain_att', # strain + age: '23 seconds', + developmental_stage: 'developmental_stage', + cell_type: 'cell_type', + disease_state: 'disease_state', + compound: 'compound', + dose: '50 units', + immunoprecipitate: 'immunoprecipitate', + growth_condition: 'growth_condition', + rnai: 'rnai', + organism_part: 'organism_part', + time_point: 'time_point', + + # EGA + treatment: 'treatment', + subject: 'subject', + disease: 'disease', + genome_size: 'genome_size', + consent_withdrawn: 'consent_withdrawn', + date_of_consent_withdrawn: 'date_of_consent_withdrawn', + user_id_of_consent_withdrawn: 'user_id_of_consent_withdrawn' + ) + end + + RSpec.shared_examples 'the tags are correctly included in the generated XML' do + let(:xml_sample_attributes) do + doc = Nokogiri::XML(xml) # takes in the XML generated by accession_sample.to_xml + sample_attributes = doc.at('SAMPLE_ATTRIBUTES') + return {} unless sample_attributes + + sample_attributes.search('SAMPLE_ATTRIBUTE').each_with_object({}) do |attr, hash| + tag = attr.at('TAG')&.text + value = attr.at('VALUE')&.text + hash[tag] = value + end + end + + it 'expects all provided expected values to be non-nil' do + # This is to make sure that values are sourced from the data locations above and are not defaulting to nil + nil_tags = expected_tags_and_values.select { |_, value| value.nil? }.keys + expect(nil_tags).to be_empty, "Expected non-nil values for tags: #{nil_tags.join(', ')}" + end + + it 'includes the EBI names of all expected tags' do + missing_tags = expected_tags_and_values.keys - xml_sample_attributes.keys + expect(missing_tags).to be_empty, + "Expected XML to include tags: '#{missing_tags.join("', '")}' " \ + "but only tags '#{xml_sample_attributes.keys.join("', '")}' were found" + end + + it 'includes the correct values for all expected tags' do + tag_value_received = expected_tags_and_values.filter_map do |tag, value| + [tag, value, xml_sample_attributes[tag]] if xml_sample_attributes[tag] != value + end + expect(tag_value_received) + .to be_empty, 'Incorrect tag values found: ' \ + "#{tag_value_received.map do |tag, expected, received| + "'#{tag}': expected #{expected.inspect}, received #{received.inspect}" + end.join('; ')}" + end + + it 'does not include tags not in the expected tag list' do + unexpected_tags = xml_sample_attributes.keys - expected_tags_and_values.keys + expect(unexpected_tags).to be_empty, + "Unexpected tags found in XML: '#{unexpected_tags.join("', '")}'" + end + end + + context 'with an OPEN study' do # study emphasised for easy test failure identification + let(:sample) { create(:sample_for_accessioning_with_open_study, sample_metadata:) } + let(:expected_tags_and_values) do + { + # EBI tag name => datasource + 'collection date' => sample.sample_metadata.date_of_sample_collection, + 'gender' => sample.sample_metadata.gender.downcase, + 'geographic location (country and/or sea)' => sample.sample_metadata.country_of_origin, + 'phenotype' => sample.sample_metadata.phenotype, + 'sample description' => sample.sample_metadata.sample_description, + 'strain' => sample.sample_metadata.sample_strain_att, + 'subject id' => sample.sample_metadata.donor_id, + 'ArrayExpress-AGE' => sample.sample_metadata.age, + 'ArrayExpress-CELL_TYPE' => sample.sample_metadata.cell_type, + 'ArrayExpress-COMPOUND' => sample.sample_metadata.compound, + 'ArrayExpress-DEVELOPMENTAL_STAGE' => sample.sample_metadata.developmental_stage, + 'ArrayExpress-DISEASE_STATE' => sample.sample_metadata.disease_state, + 'ArrayExpress-DOSE' => sample.sample_metadata.dose, + 'ArrayExpress-GENOTYPE' => sample.sample_metadata.genotype, + 'ArrayExpress-GROWTH_CONDITION' => sample.sample_metadata.growth_condition, + 'ArrayExpress-IMMUNOPRECIPITATE' => sample.sample_metadata.immunoprecipitate, + 'ArrayExpress-ORGANISM_PART' => sample.sample_metadata.organism_part, + 'ArrayExpress-PHENOTYPE' => sample.sample_metadata.phenotype, + 'ArrayExpress-RNAI' => sample.sample_metadata.rnai, + 'ArrayExpress-SEX' => sample.sample_metadata.gender.downcase, + 'ArrayExpress-SPECIES' => sample.sample_metadata.sample_common_name, + 'ArrayExpress-STRAIN_OR_LINE' => sample.sample_metadata.sample_strain_att, + 'ArrayExpress-TIME_POINT' => sample.sample_metadata.time_point, + 'ArrayExpress-TREATMENT' => sample.sample_metadata.treatment + } + end + + it_behaves_like 'the tags are correctly included in the generated XML' + end + + context 'with a MANAGED study' do # study emphasised for easy test failure identification + let(:sample) { create(:sample_for_accessioning_with_managed_study, sample_metadata:) } + + let(:expected_tags_and_values) do + { + # EBI tag name => datasource + 'collection date' => sample.sample_metadata.date_of_sample_collection, + 'gender' => sample.sample_metadata.gender.downcase, + 'geographic location (country and/or sea)' => sample.sample_metadata.country_of_origin, + 'phenotype' => sample.sample_metadata.phenotype, + 'sample description' => sample.sample_metadata.sample_description, + 'strain' => sample.sample_metadata.sample_strain_att, + 'subject id' => sample.sample_metadata.donor_id + } + end + + it_behaves_like 'the tags are correctly included in the generated XML' end end end diff --git a/test/data/xsd/SRA.common.xsd b/test/data/xsd/SRA.common.xsd index fe143db847..99d74d6c9a 100644 --- a/test/data/xsd/SRA.common.xsd +++ b/test/data/xsd/SRA.common.xsd @@ -1,8 +1,76 @@ - - - + + + + + + + + + + + Submitter designated name for the object. The name must be unique within the submission account. + + + + + + + The center name of the submitter. + + + + + + + The center name of the broker. + + + + + + + The object accession assigned by the archive. + + + + + + + + + + + + + Identifies an object by name within the namespace defined by attribute "refcenter". + + + + + + + The namespace of the attribute "refname". + + + + + + + Identifies a record by its accession. The scope of resolution is the entire Archive. + + + + @@ -66,13 +134,13 @@ - - - - Alternative/explanatory description of the same object/identifier. - - - + + + + Alternative/explanatory description of the same object/identifier. + + + @@ -104,8 +172,7 @@ A secondary identifier in the INSDC namespace. - + An identifer rom a public non-INSDC resource. @@ -122,6 +189,7 @@ + @@ -165,6 +233,7 @@ + @@ -203,24 +272,7 @@ - - - - - - - Text label to display for the link. - - - - - - The internet service link (file:, http:, ftp: etc). - - - - - + @@ -234,8 +286,7 @@ - + Numeric record id meaningful to the NCBI Entrez system. @@ -264,7 +315,6 @@ - @@ -278,8 +328,7 @@ - + Number of base/color calls, cycles, or flows per spot (raw sequence length or flow length including all @@ -292,14 +341,12 @@ - + READ_INDEX starts at 0 and is incrementally increased for each sequential READ_SPEC within a SPOT_DECODE_SPEC - + READ_LABEL is a name for this tag, and can be used to on output to determine read name, for example F or R. @@ -341,16 +388,14 @@ - + Specify the read index that precedes this read. - + Specify the read index that follows this read. @@ -383,24 +428,21 @@ - + When match occurs, the read will be tagged with this group membership - + Minimum number of matches to trigger identification. - + Maximum number of mismatches @@ -448,16 +490,14 @@ - + Specify whether the spot should have a default length for this tag if the expected base cannot be matched. - + Specify an optional starting point for tag (base offset from 1). @@ -479,103 +519,170 @@ - The PLATFORM record selects which sequencing platform and platform-specific runtime parameters. This will be + The PLATFORM record selects which sequencing platform and platform-specific runtime parameters. This will be determined by the Center. - - 454 technology use 1-color sequential flows - - - - - - + + 454 technology use 1-color sequential flows + + + + + + - - Illumina is 4-channel flowgram with 1-to-1 mapping between basecalls and flows - - - - - - + + Illumina is 4-channel flowgram with 1-to-1 mapping between basecalls and flows + + + + + + - - Helicos is similar to 454 technology - uses 1-color sequential flows - - - - - - - - - - ABI is 4-channel flowgram with 1-to-1 mapping between basecalls and flows - - - - - - - - - - CompleteGenomics platform type. At present there is no instrument model. - - - - - - - - - - Oxford Nanopore platform type. nanopore-based electronic single molecule analysis - - - - - - - - - - PacificBiosciences platform type for the single molecule real time (SMRT) technology. - - - - - - - - - - Ion Torrent Personal Genome Machine (PGM) from Life Technologies. - - - - - - - - - - Sequencers based on capillary electrophoresis technology manufactured by LifeTech (formerly Applied + + Helicos is similar to 454 technology - uses 1-color sequential flows + + + + + + + + + + ABI is 4-channel flowgram with 1-to-1 mapping between basecalls and flows + + + + + + + + + + CompleteGenomics platform type. At present there is no instrument model. + + + + + + + + + + + + + + + + + + + + Oxford Nanopore platform type. nanopore-based electronic single molecule analysis + + + + + + + + + + PacificBiosciences platform type for the single molecule real time (SMRT) technology. + + + + + + + + + + Ion Torrent Personal Genome Machine (PGM) from Life Technologies. + + + + + + + + + + Sequencers based on capillary electrophoresis technology manufactured by LifeTech (formerly Applied BioSciences). - - - - - - - + + + + + + + + + + Sequencers based on DNBSEQ by MGI Tech. + + + + + + + + + + + + + + + + + + + + + + + + semi-conductor based sequencing technology. + + + + + + + + + + Chip based electronic sensing of polymerase extension reaction + + + + + + + + + + + + + + + + + + + + + - @@ -610,7 +717,6 @@ - The PipelineType identifies the sequence or tree of actions to @@ -628,8 +734,7 @@ - + STEP_INDEX of the previous step in the workflow. Set toNIL if the first pipe section. @@ -670,6 +775,7 @@ + Reference assembly details. @@ -715,15 +821,13 @@ - + Text label to display for the link. - + The internet service link (file:, http:, ftp:, etc). @@ -737,6 +841,7 @@ + Reference assembly and sequence details. @@ -782,6 +887,7 @@ + @@ -789,7 +895,7 @@ Generic processing pipeline specification. - + Processing directives tell the Sequence Read Archive how to treat the input data, if any treatment is requested. @@ -798,111 +904,200 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Undifferentiated early AB SOLiD system - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Undifferentiated early AB SOLiD system + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/test/data/xsd/SRA.sample.xsd b/test/data/xsd/SRA.sample.xsd index 3358df07a9..0ff744e146 100644 --- a/test/data/xsd/SRA.sample.xsd +++ b/test/data/xsd/SRA.sample.xsd @@ -1,8 +1,17 @@ - - + + - + @@ -12,106 +21,91 @@ accession or an anonymized individual identifier. Or, it may fully specify provenance and isolation method of the starting material. - - - - - - - - + + + + + + + Short text that can be used to call out sample records in search results or in displays. - - - - - - - - + + + + + + + + NCBI Taxonomy Identifier. This is appropriate for individual organisms and some environmental samples. - - - - - - Scientific name of sample that distinguishes its taxonomy. Please use a - name or synonym that is tracked in the INSDC Taxonomy database. + + + + + + Scientific name of sample that distinguishes its taxonomy. Please use a + name or synonym that is tracked in the INSDC Taxonomy database. Also, this field can be used to confirm the TAXON_ID setting. - - - - - + + + + + GenBank common name of the organism. Examples: human, mouse. - - - - - - Anonymous public name of the sample. For example, HapMap human isolate NA12878. - - - - - - - Individual name of the sample. This field can be used to identify the individual identity of - a sample where appropriate (this is usually NOT appropriate for human subjects). Example: - "Glennie" the platypus. - - - - - - - - - + + + + + + + + + Free-form text describing the sample, its origin, and its method of isolation. - - - - - - - + + + + + + + Links to resources related to this sample or sample set (publication, datasets, online databases). - - - - - - - - - - - - Properties and attributes of a sample. These can be entered as free-form + + + + + + + + + + + + Properties and attributes of a sample. These can be entered as free-form tag-value pairs. For certain studies, submitters may be asked to follow a community established ontology when describing the work. - - - - - - - - - - + + + + + + + + + + + - - + + @@ -121,9 +115,9 @@ SAMPLE_SET serves as a container for a set of samples and a name space - for establishing referential integrity between them. + for establishing referential integrity between them. - +