diff --git a/CHANGELOG.md b/CHANGELOG.md index 520f48b..9eeb6c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- Add collective language codes from ISO 639-5 + ## [0.7.0] - 2023-03-08 ### Added diff --git a/README.adoc b/README.adoc index 81906d1..825b28b 100644 --- a/README.adoc +++ b/README.adoc @@ -52,9 +52,11 @@ russian = Languages['Russian'] # Languages can be retrieved via reference name, klingon = Languages['KLINgon'] # weird casing, but still works invalid = Languages[:invalid] # invalid or unknown names or ISO codes returns nil + +mayan_languages = Languages[:myn] # You can also access ISO 639-5 collective codes ---- -.Get all ISO 639-3 languages +.Get all ISO 639-3 languages and ISO 639-5 collective language codes [source] Languages.all @@ -104,7 +106,7 @@ language = Language[:fr] language.name # => French language.alpha2 # => :fr (alias for #iso639_1) -language.alpha3 # => :fra (alias for #iso639_3) +language.alpha3 # => :fra language.alpha3_bibliographic # => :fre (alias for #iso_639_2b) language.alpha3_terminology # => :fra (alias for #iso_639_2t) language.type # => :living @@ -136,41 +138,47 @@ macrolanguage.macrolanguage # => nil Why to build another gem for ISO 639? 
.Overview -[%header,cols="2,1,2,2,3"] +[%header,cols="2,1,2,2,2,3"] |=== |Gem |ISO 639-1/-2 |ISO 639-3 +|ISO 639-5 |Translations |Data Storage |https://rubygems.org/gems/iso639[iso639] |✅ |❌ +|✅ |French |Collection of Hashes |https://rubygems.org/gems/iso-639[iso-639] |✅ |❌ +|✅ |French |Array of Arrays |https://rubygems.org/gems/iso-639-data[iso-639-data] |✅ |(✅) only scope individual +|✅ |French for ISO 639-2 |Hash of Hashes |https://rubygems.org/gems/language_list[language_list] |✅ |(✅) only scope individual +|❌ |- |Array of Language-Objects |https://rubygems.org/gems/human_languages[human_languages] |✅ |✅ +|✅ |- |Array of Language-Objects |=== diff --git a/bin/update-data b/bin/update-data index 3c4d54f..5db5ef7 100755 --- a/bin/update-data +++ b/bin/update-data @@ -3,14 +3,18 @@ require 'net/http' -def download_file(filename) - src = URI.parse("https://iso639-3.sil.org/sites/iso639-3/files/downloads/#{filename}.tab") - dest = "#{__dir__}/../data/#{filename}.tsv" +def download_file(url) + src = URI.parse(url) + filename = url.split('/').last.gsub('.tab', '.tsv') + dest = "#{__dir__}/../data/#{filename}" File.write(dest, Net::HTTP.get(src)) end # Download ISO 639-3 data -download_file('iso-639-3') +download_file('https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3.tab') # Download macrolanguage data -download_file('iso-639-3-macrolanguages') +download_file('https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3-macrolanguages.tab') + +# Download ISO 639-5 data +download_file('https://id.loc.gov/vocabulary/iso639-5.tsv') diff --git a/data/iso639-5.tsv b/data/iso639-5.tsv new file mode 100644 index 0000000..e0534c9 --- /dev/null +++ b/data/iso639-5.tsv @@ -0,0 +1,116 @@ +URI code Label (English) Label (French) +http://id.loc.gov/vocabulary/iso639-5/aav aav Austro-Asiatic languages austro-asiatiques, langues +http://id.loc.gov/vocabulary/iso639-5/afa afa Afro-Asiatic languages afro-asiatiques, langues 
+http://id.loc.gov/vocabulary/iso639-5/alg alg Algonquian languages algonquines, langues +http://id.loc.gov/vocabulary/iso639-5/alv alv Atlantic-Congo languages atlantique-congo, langues +http://id.loc.gov/vocabulary/iso639-5/apa apa Apache languages apaches, langues +http://id.loc.gov/vocabulary/iso639-5/aqa aqa Alacalufan languages alacalufanes, langues +http://id.loc.gov/vocabulary/iso639-5/aql aql Algic languages algiques, langues +http://id.loc.gov/vocabulary/iso639-5/art art Artificial languages artificielles, langues +http://id.loc.gov/vocabulary/iso639-5/ath ath Athapascan languages athapascanes, langues +http://id.loc.gov/vocabulary/iso639-5/auf auf Arauan languages arauanes, langues +http://id.loc.gov/vocabulary/iso639-5/aus aus Australian languages australiennes, langues +http://id.loc.gov/vocabulary/iso639-5/awd awd Arawakan languages arawak, langues +http://id.loc.gov/vocabulary/iso639-5/azc azc Uto-Aztecan languages uto-aztèques, langues +http://id.loc.gov/vocabulary/iso639-5/bad bad Banda languages banda, langues +http://id.loc.gov/vocabulary/iso639-5/bai bai Bamileke languages bamiléké, langues +http://id.loc.gov/vocabulary/iso639-5/bat bat Baltic languages baltes, langues +http://id.loc.gov/vocabulary/iso639-5/ber ber Berber languages berbères, langues +http://id.loc.gov/vocabulary/iso639-5/bih bih Bihari languages langues biharis +http://id.loc.gov/vocabulary/iso639-5/bnt bnt Bantu languages bantou, langues +http://id.loc.gov/vocabulary/iso639-5/btk btk Batak languages batak, langues +http://id.loc.gov/vocabulary/iso639-5/cai cai Central American Indian languages amérindiennes de l'Amérique centrale, langues +http://id.loc.gov/vocabulary/iso639-5/cau cau Caucasian languages caucasiennes, langues +http://id.loc.gov/vocabulary/iso639-5/cba cba Chibchan languages chibcha, langues +http://id.loc.gov/vocabulary/iso639-5/ccn ccn North Caucasian languages caucasiennes du Nord, langues +http://id.loc.gov/vocabulary/iso639-5/ccs ccs South Caucasian 
languages caucasiennes du Sud, langues +http://id.loc.gov/vocabulary/iso639-5/cdc cdc Chadic languages tchadiques, langues +http://id.loc.gov/vocabulary/iso639-5/cdd cdd Caddoan languages caddoanes, langues +http://id.loc.gov/vocabulary/iso639-5/cel cel Celtic languages celtiques, langues; celtes, langues +http://id.loc.gov/vocabulary/iso639-5/cmc cmc Chamic languages chames, langues +http://id.loc.gov/vocabulary/iso639-5/cpe cpe Creoles and pidgins, English‑based créoles et pidgins basés sur l'anglais +http://id.loc.gov/vocabulary/iso639-5/cpf cpf Creoles and pidgins, French‑based créoles et pidgins basés sur le français +http://id.loc.gov/vocabulary/iso639-5/cpp cpp Creoles and pidgins, Portuguese-based créoles et pidgins basés sur le portugais +http://id.loc.gov/vocabulary/iso639-5/crp crp Creoles and pidgins créoles et pidgins +http://id.loc.gov/vocabulary/iso639-5/csu csu Central Sudanic languages soudaniques centrales, langues +http://id.loc.gov/vocabulary/iso639-5/cus cus Cushitic languages couchitiques, langues +http://id.loc.gov/vocabulary/iso639-5/day day Land Dayak languages dayak, langues +http://id.loc.gov/vocabulary/iso639-5/dmn dmn Mande languages mandé, langues +http://id.loc.gov/vocabulary/iso639-5/dra dra Dravidian languages dravidiennes, langues +http://id.loc.gov/vocabulary/iso639-5/egx egx Egyptian languages égyptiennes, langues +http://id.loc.gov/vocabulary/iso639-5/esx esx Eskimo-Aleut languages esquimaudes-aléoutiennes, langues +http://id.loc.gov/vocabulary/iso639-5/euq euq Basque (family) basque (famille) +http://id.loc.gov/vocabulary/iso639-5/fiu fiu Finno-Ugrian languages finno-ougriennes, langues +http://id.loc.gov/vocabulary/iso639-5/fox fox Formosan languages formosanes, langues +http://id.loc.gov/vocabulary/iso639-5/gem gem Germanic languages germaniques, langues +http://id.loc.gov/vocabulary/iso639-5/gme gme East Germanic languages germaniques orientales, langues +http://id.loc.gov/vocabulary/iso639-5/gmq gmq North Germanic languages 
germaniques septentrionales, langues +http://id.loc.gov/vocabulary/iso639-5/gmw gmw West Germanic languages germaniques occidentales, langues +http://id.loc.gov/vocabulary/iso639-5/grk grk Greek languages grecques, langues +http://id.loc.gov/vocabulary/iso639-5/hmx hmx Hmong-Mien languages hmong-mien, langues +http://id.loc.gov/vocabulary/iso639-5/hok hok Hokan languages hoka, langues +http://id.loc.gov/vocabulary/iso639-5/hyx hyx Armenian (family) arménien (famille) +http://id.loc.gov/vocabulary/iso639-5/iir iir Indo-Iranian languages indo-iraniennes, langues +http://id.loc.gov/vocabulary/iso639-5/ijo ijo Ijo languages ijo, langues +http://id.loc.gov/vocabulary/iso639-5/inc inc Indic languages indo-aryennes, langues +http://id.loc.gov/vocabulary/iso639-5/ine ine Indo-European languages indo-européennes, langues +http://id.loc.gov/vocabulary/iso639-5/ira ira Iranian languages iraniennes, langues +http://id.loc.gov/vocabulary/iso639-5/iro iro Iroquoian languages iroquoises, langues +http://id.loc.gov/vocabulary/iso639-5/itc itc Italic languages italiques, langues +http://id.loc.gov/vocabulary/iso639-5/jpx jpx Japanese (family) japonais (famille) +http://id.loc.gov/vocabulary/iso639-5/kar kar Karen languages karen, langues +http://id.loc.gov/vocabulary/iso639-5/kdo kdo Kordofanian languages kordofaniennes, langues +http://id.loc.gov/vocabulary/iso639-5/khi khi Khoisan languages khoïsan, langues +http://id.loc.gov/vocabulary/iso639-5/kro kro Kru languages krou, langues +http://id.loc.gov/vocabulary/iso639-5/map map Austronesian languages austronésiennes, langues +http://id.loc.gov/vocabulary/iso639-5/mkh mkh Mon-Khmer languages môn-khmer, langues +http://id.loc.gov/vocabulary/iso639-5/mno mno Manobo languages manobo, langues +http://id.loc.gov/vocabulary/iso639-5/mun mun Munda languages mounda, langues +http://id.loc.gov/vocabulary/iso639-5/myn myn Mayan languages maya, langues +http://id.loc.gov/vocabulary/iso639-5/nah nah Nahuatl languages nahuatl, langues 
+http://id.loc.gov/vocabulary/iso639-5/nai nai North American Indian languages nord-amérindiennes, langues +http://id.loc.gov/vocabulary/iso639-5/ngf ngf Trans-New Guinea languages trans-nouvelle-guinée, langues +http://id.loc.gov/vocabulary/iso639-5/nic nic Niger-Kordofanian languages nigéro-kordofaniennes, langues +http://id.loc.gov/vocabulary/iso639-5/nub nub Nubian languages nubiennes, langues +http://id.loc.gov/vocabulary/iso639-5/omq omq Oto-Manguean languages otomangue, langues +http://id.loc.gov/vocabulary/iso639-5/omv omv Omotic languages omotiques, langues +http://id.loc.gov/vocabulary/iso639-5/oto oto Otomian languages otomi, langues +http://id.loc.gov/vocabulary/iso639-5/paa paa Papuan languages papoues, langues +http://id.loc.gov/vocabulary/iso639-5/phi phi Philippine languages philippines, langues +http://id.loc.gov/vocabulary/iso639-5/plf plf Central Malayo-Polynesian languages malayo-polynésiennes centrales, langues +http://id.loc.gov/vocabulary/iso639-5/poz poz Malayo-Polynesian languages malayo-polynésiennes, langues +http://id.loc.gov/vocabulary/iso639-5/pqe pqe Eastern Malayo-Polynesian languages malayo-polynésiennes orientales, langues +http://id.loc.gov/vocabulary/iso639-5/pqw pqw Western Malayo-Polynesian languages malayo-polynésiennes occidentales, langues +http://id.loc.gov/vocabulary/iso639-5/pra pra Prakrit languages prâkrit, langues +http://id.loc.gov/vocabulary/iso639-5/qwe qwe Quechuan (family) quechua (famille) +http://id.loc.gov/vocabulary/iso639-5/roa roa Romance languages romanes, langues +http://id.loc.gov/vocabulary/iso639-5/sai sai South American Indian languages sud-amérindiennes, langues +http://id.loc.gov/vocabulary/iso639-5/sal sal Salishan languages salishennes, langues +http://id.loc.gov/vocabulary/iso639-5/sdv sdv Eastern Sudanic languages soudaniques orientales, langues +http://id.loc.gov/vocabulary/iso639-5/sem sem Semitic languages sémitiques, langues +http://id.loc.gov/vocabulary/iso639-5/sgn sgn sign languages 
langues des signes +http://id.loc.gov/vocabulary/iso639-5/sio sio Siouan languages sioux, langues +http://id.loc.gov/vocabulary/iso639-5/sit sit Sino-Tibetan languages sino-tibétaines, langues +http://id.loc.gov/vocabulary/iso639-5/sla sla Slavic languages slaves, langues +http://id.loc.gov/vocabulary/iso639-5/smi smi Sami languages sames, langues +http://id.loc.gov/vocabulary/iso639-5/son son Songhai languages songhai, langues +http://id.loc.gov/vocabulary/iso639-5/sqj sqj Albanian languages albanaises, langues +http://id.loc.gov/vocabulary/iso639-5/ssa ssa Nilo-Saharan languages nilo-sahariennes, langues +http://id.loc.gov/vocabulary/iso639-5/syd syd Samoyedic languages samoyèdes, langues +http://id.loc.gov/vocabulary/iso639-5/tai tai Tai languages tai, langues +http://id.loc.gov/vocabulary/iso639-5/tbq tbq Tibeto-Burman languages tibéto-birmanes, langues +http://id.loc.gov/vocabulary/iso639-5/trk trk Turkic languages turques, langues +http://id.loc.gov/vocabulary/iso639-5/tup tup Tupi languages tupi, langues +http://id.loc.gov/vocabulary/iso639-5/tut tut Altaic languages altaïques, langues +http://id.loc.gov/vocabulary/iso639-5/tuw tuw Tungus languages toungouses, langues +http://id.loc.gov/vocabulary/iso639-5/urj urj Uralic languages ouraliennes, langues +http://id.loc.gov/vocabulary/iso639-5/wak wak Wakashan languages wakashanes, langues +http://id.loc.gov/vocabulary/iso639-5/wen wen Sorbian languages sorabes, langues +http://id.loc.gov/vocabulary/iso639-5/xgn xgn Mongolian languages mongoles, langues +http://id.loc.gov/vocabulary/iso639-5/xnd xnd Na-Dene languages na-déné, langues +http://id.loc.gov/vocabulary/iso639-5/ypk ypk Yupik languages yupik, langues +http://id.loc.gov/vocabulary/iso639-5/zhx zhx Chinese (family) chinois (famille) +http://id.loc.gov/vocabulary/iso639-5/zle zle East Slavic languages slaves orientales, langues +http://id.loc.gov/vocabulary/iso639-5/zls zls South Slavic languages slaves méridionales, langues 
+http://id.loc.gov/vocabulary/iso639-5/zlw zlw West Slavic languages slaves occidentales, langues +http://id.loc.gov/vocabulary/iso639-5/znd znd Zande languages zandé, langues \ No newline at end of file diff --git a/lib/languages.rb b/lib/languages.rb index 0aea0cb..f9ed4fa 100644 --- a/lib/languages.rb +++ b/lib/languages.rb @@ -5,6 +5,7 @@ require_relative 'languages/version' require_relative 'languages/constants' require_relative 'languages/language' +require_relative 'languages/collective' # Provides living, extinct, ancient, historic, and constructed languages, specified in ISO 639-3 module Languages @@ -92,7 +93,10 @@ def load_tsv_data(filename) @@data = load_tsv_data('iso-639-3.tsv') # rubocop:disable Style/ClassVars .map { |row| row.to_h.transform_keys { |k| k.downcase.to_sym } } .each_with_object({}) { |l, h| h[l[:id].to_sym] = Language.new(l) } - .freeze + + collective_codes = load_tsv_data('iso639-5.tsv').to_h { |row| [row['code'].to_sym, Collective.new(row)] } + @@data.merge! collective_codes + @@data.freeze load_tsv_data('iso-639-3-macrolanguages.tsv') # Ignore deprecated mappings (i.e. 
row[2] = 'R') diff --git a/lib/languages/collective.rb b/lib/languages/collective.rb new file mode 100644 index 0000000..5fddb13 --- /dev/null +++ b/lib/languages/collective.rb @@ -0,0 +1,18 @@ +# frozen_string_literal: true + +module Languages + # Collective Language Code defined in ISO 639-5 + class Collective < Language + # rubocop:disable Lint/MissingSuper + def initialize(csv_attributes) + @iso639_3 = nil + @iso639_2b = csv_attributes['code']&.to_sym + @iso639_2t = csv_attributes['code']&.to_sym + @iso639_1 = nil + @scope = :collective + @type = :collective + @name = csv_attributes['Label (English)'] + end + # rubocop:enable Lint/MissingSuper + end +end diff --git a/lib/languages/language.rb b/lib/languages/language.rb index 5248ad8..385a973 100644 --- a/lib/languages/language.rb +++ b/lib/languages/language.rb @@ -22,7 +22,6 @@ def initialize(csv_attributes) # rubocop:disable Metrics/AbcSize, Metrics/Cyclom alias iso639_5 iso639_2 alias alpha2 iso639_1 - alias alpha3 iso639_3 alias alpha3_bibliographic iso639_2b alias alpha3_terminology iso639_2t @@ -58,5 +57,9 @@ def hash def <=>(other) other.iso639_3 <=> iso639_3 end + + def alpha3 + iso639_3 || iso639_2t || iso639_2b + end end end diff --git a/test/test_collective.rb b/test/test_collective.rb new file mode 100644 index 0000000..23787ac --- /dev/null +++ b/test/test_collective.rb @@ -0,0 +1,33 @@ +# frozen_string_literal: true + +require 'test_helper' + +class TestCollective < Minitest::Test + def setup + @collective = ::Languages::Collective.new({ + 'code' => 'nah', + 'Label (English)' => 'Nahuatl languages' + }) + end + + def test_that_it_has_a_name + assert_instance_of String, @collective.name + assert_equal 'Nahuatl languages', @collective.name + end + + def test_that_it_has_no_alpha2 + assert_nil @collective.alpha2 + end + + def test_it_has_alpha3_bibliographic + assert_equal :nah, @collective.alpha3_bibliographic + end + + def test_it_has_collective_type + assert_equal :collective, @collective.type + 
end + + def test_it_has_collective_scope + assert_equal :collective, @collective.scope + end +end diff --git a/test/test_languages.rb b/test/test_languages.rb index 1c0bcfa..7d17a67 100644 --- a/test/test_languages.rb +++ b/test/test_languages.rb @@ -2,6 +2,7 @@ require 'test_helper' +# rubocop:disable Metrics/ClassLength class TestLanguages < Minitest::Test def test_that_it_has_a_version_number refute_nil ::Languages::VERSION @@ -125,4 +126,17 @@ def test_macrolanguages_have_no_macrolanguage assert_empty(macrolanguages.reject { |l| l.macrolanguage.nil? }) end + + def test_iso_639_5_collective_codes_are_included + collective_codes = %i[aav afa alg alv apa aqa aql art ath auf aus awd azc bad bai bat ber bih bnt btk cai cau cba + ccn ccs cdc cdd cel cmc cpe cpf cpp crp csu cus day dmn dra egx esx euq fiu fox gem gme gmq + gmw grk hmx hok hyx iir ijo inc ine ira iro itc jpx kar kdo khi kro map mkh mno mun myn nah + nai ngf nic nub omq omv oto paa phi plf poz pqe pqw pra qwe roa sai sal sdv sem sgn sio sit + sla smi son sqj ssa syd wak wen xgn xnd ypk zhx zle zls zlw znd] + + assert(collective_codes.all? do |collective_code| + ::Languages.all.map(&:alpha3).include? collective_code + end) + end end +# rubocop:enable Metrics/ClassLength