From 3a738dc8c59a7a2b6bc9d23f827222521eed4836 Mon Sep 17 00:00:00 2001 From: Jane Sandberg Date: Wed, 8 Mar 2023 15:32:05 -0800 Subject: [PATCH] i49: Add collective codes from ISO 639-5 --- README.adoc | 2 +- bin/update-data | 14 +++-- data/iso639-5.tsv | 116 ++++++++++++++++++++++++++++++++++++ lib/languages.rb | 6 +- lib/languages/collective.rb | 18 ++++++ lib/languages/constants.rb | 4 +- lib/languages/language.rb | 5 +- test/test_collective.rb | 33 ++++++++++ test/test_languages.rb | 14 +++++ 9 files changed, 202 insertions(+), 10 deletions(-) create mode 100644 data/iso639-5.tsv create mode 100644 lib/languages/collective.rb create mode 100644 test/test_collective.rb diff --git a/README.adoc b/README.adoc index 81906d1..6946474 100644 --- a/README.adoc +++ b/README.adoc @@ -104,7 +104,7 @@ language = Language[:fr] language.name # => French language.alpha2 # => :fr (alias for #iso639_1) -language.alpha3 # => :fra (alias for #iso639_3) +language.alpha3 # => :fra language.alpha3_bibliographic # => :fre (alias for #iso_639_2b) language.alpha3_terminology # => :fra (alias for #iso_639_2t) language.type # => :living diff --git a/bin/update-data b/bin/update-data index 3c4d54f..5db5ef7 100755 --- a/bin/update-data +++ b/bin/update-data @@ -3,14 +3,18 @@ require 'net/http' -def download_file(filename) - src = URI.parse("https://iso639-3.sil.org/sites/iso639-3/files/downloads/#{filename}.tab") - dest = "#{__dir__}/../data/#{filename}.tsv" +def download_file(url) + src = URI.parse(url) + filename = url.split('/').last.gsub('.tab', '.tsv') + dest = "#{__dir__}/../data/#{filename}" File.write(dest, Net::HTTP.get(src)) end # Download ISO 639-3 data -download_file('iso-639-3') +download_file('https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3.tab') # Download macrolanguage data -download_file('iso-639-3-macrolanguages') +download_file('https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3-macrolanguages.tab') + +# Download ISO 639-5 data +download_file('https://id.loc.gov/vocabulary/iso639-5.tsv') diff --git a/data/iso639-5.tsv b/data/iso639-5.tsv new file mode 100644 index 0000000..e0534c9 --- /dev/null +++ b/data/iso639-5.tsv @@ -0,0 +1,116 @@ +URI code Label (English) Label (French) +http://id.loc.gov/vocabulary/iso639-5/aav aav Austro-Asiatic languages austro-asiatiques, langues +http://id.loc.gov/vocabulary/iso639-5/afa afa Afro-Asiatic languages afro-asiatiques, langues +http://id.loc.gov/vocabulary/iso639-5/alg alg Algonquian languages algonquines, langues +http://id.loc.gov/vocabulary/iso639-5/alv alv Atlantic-Congo languages atlantique-congo, langues +http://id.loc.gov/vocabulary/iso639-5/apa apa Apache languages apaches, langues +http://id.loc.gov/vocabulary/iso639-5/aqa aqa Alacalufan languages alacalufanes, langues +http://id.loc.gov/vocabulary/iso639-5/aql aql Algic languages algiques, langues +http://id.loc.gov/vocabulary/iso639-5/art art Artificial languages artificielles, langues +http://id.loc.gov/vocabulary/iso639-5/ath ath Athapascan languages athapascanes, langues +http://id.loc.gov/vocabulary/iso639-5/auf auf Arauan languages arauanes, langues +http://id.loc.gov/vocabulary/iso639-5/aus aus Australian languages australiennes, langues +http://id.loc.gov/vocabulary/iso639-5/awd awd Arawakan languages arawak, langues +http://id.loc.gov/vocabulary/iso639-5/azc azc Uto-Aztecan languages uto-aztèques, langues +http://id.loc.gov/vocabulary/iso639-5/bad bad Banda languages banda, langues +http://id.loc.gov/vocabulary/iso639-5/bai bai Bamileke languages bamiléké, langues +http://id.loc.gov/vocabulary/iso639-5/bat bat Baltic languages baltes, langues +http://id.loc.gov/vocabulary/iso639-5/ber ber Berber languages berbères, langues +http://id.loc.gov/vocabulary/iso639-5/bih bih Bihari languages langues biharis +http://id.loc.gov/vocabulary/iso639-5/bnt bnt Bantu languages bantou, langues +http://id.loc.gov/vocabulary/iso639-5/btk btk Batak languages batak, langues +http://id.loc.gov/vocabulary/iso639-5/cai cai Central American Indian languages amérindiennes de l'Amérique centrale, langues +http://id.loc.gov/vocabulary/iso639-5/cau cau Caucasian languages caucasiennes, langues +http://id.loc.gov/vocabulary/iso639-5/cba cba Chibchan languages chibcha, langues +http://id.loc.gov/vocabulary/iso639-5/ccn ccn North Caucasian languages caucasiennes du Nord, langues +http://id.loc.gov/vocabulary/iso639-5/ccs ccs South Caucasian languages caucasiennes du Sud, langues +http://id.loc.gov/vocabulary/iso639-5/cdc cdc Chadic languages tchadiques, langues +http://id.loc.gov/vocabulary/iso639-5/cdd cdd Caddoan languages caddoanes, langues +http://id.loc.gov/vocabulary/iso639-5/cel cel Celtic languages celtiques, langues; celtes, langues +http://id.loc.gov/vocabulary/iso639-5/cmc cmc Chamic languages chames, langues +http://id.loc.gov/vocabulary/iso639-5/cpe cpe Creoles and pidgins, English‑based créoles et pidgins basés sur l'anglais +http://id.loc.gov/vocabulary/iso639-5/cpf cpf Creoles and pidgins, French‑based créoles et pidgins basés sur le français +http://id.loc.gov/vocabulary/iso639-5/cpp cpp Creoles and pidgins, Portuguese-based créoles et pidgins basés sur le portugais +http://id.loc.gov/vocabulary/iso639-5/crp crp Creoles and pidgins créoles et pidgins +http://id.loc.gov/vocabulary/iso639-5/csu csu Central Sudanic languages soudaniques centrales, langues +http://id.loc.gov/vocabulary/iso639-5/cus cus Cushitic languages couchitiques, langues +http://id.loc.gov/vocabulary/iso639-5/day day Land Dayak languages dayak, langues +http://id.loc.gov/vocabulary/iso639-5/dmn dmn Mande languages mandé, langues +http://id.loc.gov/vocabulary/iso639-5/dra dra Dravidian languages dravidiennes, langues +http://id.loc.gov/vocabulary/iso639-5/egx egx Egyptian languages égyptiennes, langues +http://id.loc.gov/vocabulary/iso639-5/esx esx Eskimo-Aleut languages esquimaudes-aléoutiennes, langues +http://id.loc.gov/vocabulary/iso639-5/euq euq Basque (family) basque (famille) +http://id.loc.gov/vocabulary/iso639-5/fiu fiu Finno-Ugrian languages finno-ougriennes, langues +http://id.loc.gov/vocabulary/iso639-5/fox fox Formosan languages formosanes, langues +http://id.loc.gov/vocabulary/iso639-5/gem gem Germanic languages germaniques, langues +http://id.loc.gov/vocabulary/iso639-5/gme gme East Germanic languages germaniques orientales, langues +http://id.loc.gov/vocabulary/iso639-5/gmq gmq North Germanic languages germaniques septentrionales, langues +http://id.loc.gov/vocabulary/iso639-5/gmw gmw West Germanic languages germaniques occidentales, langues +http://id.loc.gov/vocabulary/iso639-5/grk grk Greek languages grecques, langues +http://id.loc.gov/vocabulary/iso639-5/hmx hmx Hmong-Mien languages hmong-mien, langues +http://id.loc.gov/vocabulary/iso639-5/hok hok Hokan languages hoka, langues +http://id.loc.gov/vocabulary/iso639-5/hyx hyx Armenian (family) arménien (famille) +http://id.loc.gov/vocabulary/iso639-5/iir iir Indo-Iranian languages indo-iraniennes, langues +http://id.loc.gov/vocabulary/iso639-5/ijo ijo Ijo languages ijo, langues +http://id.loc.gov/vocabulary/iso639-5/inc inc Indic languages indo-aryennes, langues +http://id.loc.gov/vocabulary/iso639-5/ine ine Indo-European languages indo-européennes, langues +http://id.loc.gov/vocabulary/iso639-5/ira ira Iranian languages iraniennes, langues +http://id.loc.gov/vocabulary/iso639-5/iro iro Iroquoian languages iroquoises, langues +http://id.loc.gov/vocabulary/iso639-5/itc itc Italic languages italiques, langues +http://id.loc.gov/vocabulary/iso639-5/jpx jpx Japanese (family) japonais (famille) +http://id.loc.gov/vocabulary/iso639-5/kar kar Karen languages karen, langues +http://id.loc.gov/vocabulary/iso639-5/kdo kdo Kordofanian languages kordofaniennes, langues +http://id.loc.gov/vocabulary/iso639-5/khi khi Khoisan languages khoïsan, langues +http://id.loc.gov/vocabulary/iso639-5/kro kro Kru languages krou, langues +http://id.loc.gov/vocabulary/iso639-5/map map Austronesian languages austronésiennes, langues +http://id.loc.gov/vocabulary/iso639-5/mkh mkh Mon-Khmer languages môn-khmer, langues +http://id.loc.gov/vocabulary/iso639-5/mno mno Manobo languages manobo, langues +http://id.loc.gov/vocabulary/iso639-5/mun mun Munda languages mounda, langues +http://id.loc.gov/vocabulary/iso639-5/myn myn Mayan languages maya, langues +http://id.loc.gov/vocabulary/iso639-5/nah nah Nahuatl languages nahuatl, langues +http://id.loc.gov/vocabulary/iso639-5/nai nai North American Indian languages nord-amérindiennes, langues +http://id.loc.gov/vocabulary/iso639-5/ngf ngf Trans-New Guinea languages trans-nouvelle-guinée, langues +http://id.loc.gov/vocabulary/iso639-5/nic nic Niger-Kordofanian languages nigéro-kordofaniennes, langues +http://id.loc.gov/vocabulary/iso639-5/nub nub Nubian languages nubiennes, langues +http://id.loc.gov/vocabulary/iso639-5/omq omq Oto-Manguean languages otomangue, langues +http://id.loc.gov/vocabulary/iso639-5/omv omv Omotic languages omotiques, langues +http://id.loc.gov/vocabulary/iso639-5/oto oto Otomian languages otomi, langues +http://id.loc.gov/vocabulary/iso639-5/paa paa Papuan languages papoues, langues +http://id.loc.gov/vocabulary/iso639-5/phi phi Philippine languages philippines, langues +http://id.loc.gov/vocabulary/iso639-5/plf plf Central Malayo-Polynesian languages malayo-polynésiennes centrales, langues +http://id.loc.gov/vocabulary/iso639-5/poz poz Malayo-Polynesian languages malayo-polynésiennes, langues +http://id.loc.gov/vocabulary/iso639-5/pqe pqe Eastern Malayo-Polynesian languages malayo-polynésiennes orientales, langues +http://id.loc.gov/vocabulary/iso639-5/pqw pqw Western Malayo-Polynesian languages malayo-polynésiennes occidentales, langues +http://id.loc.gov/vocabulary/iso639-5/pra pra Prakrit languages prâkrit, langues +http://id.loc.gov/vocabulary/iso639-5/qwe qwe Quechuan (family) quechua (famille) +http://id.loc.gov/vocabulary/iso639-5/roa roa Romance languages romanes, langues +http://id.loc.gov/vocabulary/iso639-5/sai sai South American Indian languages sud-amérindiennes, langues +http://id.loc.gov/vocabulary/iso639-5/sal sal Salishan languages salishennes, langues +http://id.loc.gov/vocabulary/iso639-5/sdv sdv Eastern Sudanic languages soudaniques orientales, langues +http://id.loc.gov/vocabulary/iso639-5/sem sem Semitic languages sémitiques, langues +http://id.loc.gov/vocabulary/iso639-5/sgn sgn sign languages langues des signes +http://id.loc.gov/vocabulary/iso639-5/sio sio Siouan languages sioux, langues +http://id.loc.gov/vocabulary/iso639-5/sit sit Sino-Tibetan languages sino-tibétaines, langues +http://id.loc.gov/vocabulary/iso639-5/sla sla Slavic languages slaves, langues +http://id.loc.gov/vocabulary/iso639-5/smi smi Sami languages sames, langues +http://id.loc.gov/vocabulary/iso639-5/son son Songhai languages songhai, langues +http://id.loc.gov/vocabulary/iso639-5/sqj sqj Albanian languages albanaises, langues +http://id.loc.gov/vocabulary/iso639-5/ssa ssa Nilo-Saharan languages nilo-sahariennes, langues +http://id.loc.gov/vocabulary/iso639-5/syd syd Samoyedic languages samoyèdes, langues +http://id.loc.gov/vocabulary/iso639-5/tai tai Tai languages tai, langues +http://id.loc.gov/vocabulary/iso639-5/tbq tbq Tibeto-Burman languages tibéto-birmanes, langues +http://id.loc.gov/vocabulary/iso639-5/trk trk Turkic languages turques, langues +http://id.loc.gov/vocabulary/iso639-5/tup tup Tupi languages tupi, langues +http://id.loc.gov/vocabulary/iso639-5/tut tut Altaic languages altaïques, langues +http://id.loc.gov/vocabulary/iso639-5/tuw tuw Tungus languages toungouses, langues +http://id.loc.gov/vocabulary/iso639-5/urj urj Uralic languages ouraliennes, langues +http://id.loc.gov/vocabulary/iso639-5/wak wak Wakashan languages wakashanes, langues +http://id.loc.gov/vocabulary/iso639-5/wen wen Sorbian languages sorabes, langues +http://id.loc.gov/vocabulary/iso639-5/xgn xgn Mongolian languages mongoles, langues +http://id.loc.gov/vocabulary/iso639-5/xnd xnd Na-Dene languages na-déné, langues +http://id.loc.gov/vocabulary/iso639-5/ypk ypk Yupik languages yupik, langues +http://id.loc.gov/vocabulary/iso639-5/zhx zhx Chinese (family) chinois (famille) +http://id.loc.gov/vocabulary/iso639-5/zle zle East Slavic languages slaves orientales, langues +http://id.loc.gov/vocabulary/iso639-5/zls zls South Slavic languages slaves méridionales, langues +http://id.loc.gov/vocabulary/iso639-5/zlw zlw West Slavic languages slaves occidentales, langues +http://id.loc.gov/vocabulary/iso639-5/znd znd Zande languages zandé, langues \ No newline at end of file diff --git a/lib/languages.rb b/lib/languages.rb index 0aea0cb..f9ed4fa 100644 --- a/lib/languages.rb +++ b/lib/languages.rb @@ -5,6 +5,7 @@ require_relative 'languages/version' require_relative 'languages/constants' require_relative 'languages/language' +require_relative 'languages/collective' # Provides living, extinct, ancient, historic, and constructed languages, specified in ISO 639-3 module Languages @@ -92,7 +93,10 @@ def load_tsv_data(filename) @@data = load_tsv_data('iso-639-3.tsv') # rubocop:disable Style/ClassVars .map { |row| row.to_h.transform_keys { |k| k.downcase.to_sym } } .each_with_object({}) { |l, h| h[l[:id].to_sym] = Language.new(l) } - .freeze + + collective_codes = load_tsv_data('iso639-5.tsv').to_h { |row| [row['code'].to_sym, Collective.new(row)] } + @@data.merge! collective_codes + @@data.freeze load_tsv_data('iso-639-3-macrolanguages.tsv') # Ignore deprecated mappings (i.e. row[2] = 'R') diff --git a/lib/languages/collective.rb b/lib/languages/collective.rb new file mode 100644 index 0000000..5fddb13 --- /dev/null +++ b/lib/languages/collective.rb @@ -0,0 +1,18 @@ +# frozen_string_literal: true + +module Languages + # Collective Language Code defined in ISO 639-5 + class Collective < Language + # rubocop:disable Lint/MissingSuper + def initialize(csv_attributes) + @iso639_3 = nil + @iso639_2b = csv_attributes['code']&.to_sym + @iso639_2t = csv_attributes['code']&.to_sym + @iso639_1 = nil + @scope = :collective + @type = :collective + @name = csv_attributes['Label (English)'] + end + # rubocop:enable Lint/MissingSuper + end +end diff --git a/lib/languages/constants.rb b/lib/languages/constants.rb index 5aefdf4..74237d4 100644 --- a/lib/languages/constants.rb +++ b/lib/languages/constants.rb @@ -1,6 +1,6 @@ # frozen_string_literal: true module Languages - TYPES = %w[ancient constructed extinct historical living special].freeze - SCOPES = %w[individual macrolanguage special].freeze + TYPES = %w[ancient collective constructed extinct historical living special].freeze + SCOPES = %w[collective individual macrolanguage special].freeze end diff --git a/lib/languages/language.rb b/lib/languages/language.rb index 5248ad8..385a973 100644 --- a/lib/languages/language.rb +++ b/lib/languages/language.rb @@ -22,7 +22,6 @@ def initialize(csv_attributes) # rubocop:disable Metrics/AbcSize, Metrics/Cyclom alias iso639_5 iso639_2 alias alpha2 iso639_1 - alias alpha3 iso639_3 alias alpha3_bibliographic iso639_2b alias alpha3_terminology iso639_2t @@ -58,5 +57,9 @@ def hash def <=>(other) other.iso639_3 <=> iso639_3 end + + def alpha3 + iso639_3 || iso639_2t || iso639_2b + end end end diff --git a/test/test_collective.rb b/test/test_collective.rb new file mode 100644 index 0000000..23787ac --- /dev/null +++ b/test/test_collective.rb @@ -0,0 +1,33 @@ +# frozen_string_literal: true + +require 'test_helper' + +class TestCollective < Minitest::Test + def setup + @collective = ::Languages::Collective.new({ + 'code' => 'nah', + 'Label (English)' => 'Nahuatl languages' + }) + end + + def test_that_it_has_a_name + assert_instance_of String, @collective.name + assert_equal 'Nahuatl languages', @collective.name + end + + def test_that_it_has_no_alpha2 + assert_nil @collective.alpha2 + end + + def test_it_has_alpha3_bibliographic + assert_equal :nah, @collective.alpha3_bibliographic + end + + def test_it_has_collective_type + assert_equal :collective, @collective.type + end + + def test_it_has_collective_scope + assert_equal :collective, @collective.scope + end +end diff --git a/test/test_languages.rb b/test/test_languages.rb index 1c0bcfa..7d17a67 100644 --- a/test/test_languages.rb +++ b/test/test_languages.rb @@ -2,6 +2,7 @@ require 'test_helper' +# rubocop:disable Metrics/ClassLength class TestLanguages < Minitest::Test def test_that_it_has_a_version_number refute_nil ::Languages::VERSION @@ -125,4 +126,17 @@ def test_macrolanguages_have_no_macrolanguage assert_empty(macrolanguages.reject { |l| l.macrolanguage.nil? }) end + + def test_iso_639_5_collective_codes_are_included + collective_codes = %i[aav afa alg alv apa aqa aql art ath auf aus awd azc bad bai bat ber bih bnt btk cai cau cba + ccn ccs cdc cdd cel cmc cpe cpf cpp crp csu cus day dmn dra egx esx euq fiu fox gem gme gmq + gmw grk hmx hok hyx iir ijo inc ine ira iro itc jpx kar kdo khi kro map mkh mno mun myn nah + nai ngf nic nub omq omv oto paa phi plf poz pqe pqw pra qwe roa sai sal sdv sem sgn sio sit + sla smi son sqj ssa syd wak wen xgn xnd ypk zhx zle zls zlw znd] + + assert(collective_codes.all? do |collective_code| + ::Languages.all.map(&:alpha3).include? collective_code + end) + end end +# rubocop:enable Metrics/ClassLength