Skip to content

Commit

Permalink
i49: Add collective codes from ISO 639-5
Browse files Browse the repository at this point in the history
  • Loading branch information
sandbergja committed Mar 8, 2023
1 parent 87f6680 commit 3a738dc
Show file tree
Hide file tree
Showing 9 changed files with 202 additions and 10 deletions.
2 changes: 1 addition & 1 deletion README.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ language = Language[:fr]
language.name # => French
language.alpha2 # => :fr (alias for #iso639_1)
language.alpha3 # => :fra (alias for #iso639_3)
language.alpha3 # => :fra
language.alpha3_bibliographic # => :fre (alias for #iso_639_2b)
language.alpha3_terminology # => :fra (alias for #iso_639_2t)
language.type # => :living
Expand Down
14 changes: 9 additions & 5 deletions bin/update-data
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,18 @@

require 'net/http'

def download_file(filename)
src = URI.parse("https://iso639-3.sil.org/sites/iso639-3/files/downloads/#{filename}.tab")
dest = "#{__dir__}/../data/#{filename}.tsv"
# Downloads +url+ and stores the response body in the data directory next to
# this script.
#
# The destination file name is the last segment of the URL path; a trailing
# ".tab" extension is rewritten to ".tsv", any other name is kept unchanged.
#
# @param url [String] absolute URL of the file to fetch
# @return [Integer] number of bytes written (the result of File.write)
# @raise [RuntimeError] if the server does not answer with a 2xx status
def download_file(url)
  src = URI.parse(url)
  response = Net::HTTP.get_response(src)
  # Never overwrite the bundled data with an error page or a redirect stub.
  raise "Failed to download #{url}: #{response.code} #{response.message}" unless response.is_a?(Net::HTTPSuccess)

  # Use the URI path so a query string cannot leak into the file name, and
  # only rewrite ".tab" when it really is the extension.
  filename = File.basename(src.path).sub(/\.tab\z/, '.tsv')
  File.write("#{__dir__}/../data/#{filename}", response.body)
end

# NOTE(review): download_file takes a full URL and passes it to URI.parse.
# The bare-name calls below ('iso-639-3', 'iso-639-3-macrolanguages') look like
# the pre-change lines of this diff view — confirm they are not meant to remain.

# Download ISO 639-3 data
download_file('iso-639-3')
download_file('https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3.tab')

# Download macrolanguage data
download_file('iso-639-3-macrolanguages')
download_file('https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3-macrolanguages.tab')

# Download ISO 639-5 data
# (already served with a .tsv extension, so download_file keeps the name as-is)
download_file('https://id.loc.gov/vocabulary/iso639-5.tsv')
116 changes: 116 additions & 0 deletions data/iso639-5.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
URI code Label (English) Label (French)
http://id.loc.gov/vocabulary/iso639-5/aav aav Austro-Asiatic languages austro-asiatiques, langues
http://id.loc.gov/vocabulary/iso639-5/afa afa Afro-Asiatic languages afro-asiatiques, langues
http://id.loc.gov/vocabulary/iso639-5/alg alg Algonquian languages algonquines, langues
http://id.loc.gov/vocabulary/iso639-5/alv alv Atlantic-Congo languages atlantique-congo, langues
http://id.loc.gov/vocabulary/iso639-5/apa apa Apache languages apaches, langues
http://id.loc.gov/vocabulary/iso639-5/aqa aqa Alacalufan languages alacalufanes, langues
http://id.loc.gov/vocabulary/iso639-5/aql aql Algic languages algiques, langues
http://id.loc.gov/vocabulary/iso639-5/art art Artificial languages artificielles, langues
http://id.loc.gov/vocabulary/iso639-5/ath ath Athapascan languages athapascanes, langues
http://id.loc.gov/vocabulary/iso639-5/auf auf Arauan languages arauanes, langues
http://id.loc.gov/vocabulary/iso639-5/aus aus Australian languages australiennes, langues
http://id.loc.gov/vocabulary/iso639-5/awd awd Arawakan languages arawak, langues
http://id.loc.gov/vocabulary/iso639-5/azc azc Uto-Aztecan languages uto-aztèques, langues
http://id.loc.gov/vocabulary/iso639-5/bad bad Banda languages banda, langues
http://id.loc.gov/vocabulary/iso639-5/bai bai Bamileke languages bamiléké, langues
http://id.loc.gov/vocabulary/iso639-5/bat bat Baltic languages baltes, langues
http://id.loc.gov/vocabulary/iso639-5/ber ber Berber languages berbères, langues
http://id.loc.gov/vocabulary/iso639-5/bih bih Bihari languages langues biharis
http://id.loc.gov/vocabulary/iso639-5/bnt bnt Bantu languages bantou, langues
http://id.loc.gov/vocabulary/iso639-5/btk btk Batak languages batak, langues
http://id.loc.gov/vocabulary/iso639-5/cai cai Central American Indian languages amérindiennes de l'Amérique centrale, langues
http://id.loc.gov/vocabulary/iso639-5/cau cau Caucasian languages caucasiennes, langues
http://id.loc.gov/vocabulary/iso639-5/cba cba Chibchan languages chibcha, langues
http://id.loc.gov/vocabulary/iso639-5/ccn ccn North Caucasian languages caucasiennes du Nord, langues
http://id.loc.gov/vocabulary/iso639-5/ccs ccs South Caucasian languages caucasiennes du Sud, langues
http://id.loc.gov/vocabulary/iso639-5/cdc cdc Chadic languages tchadiques, langues
http://id.loc.gov/vocabulary/iso639-5/cdd cdd Caddoan languages caddoanes, langues
http://id.loc.gov/vocabulary/iso639-5/cel cel Celtic languages celtiques, langues; celtes, langues
http://id.loc.gov/vocabulary/iso639-5/cmc cmc Chamic languages chames, langues
http://id.loc.gov/vocabulary/iso639-5/cpe cpe Creoles and pidgins, English‑based créoles et pidgins basés sur l'anglais
http://id.loc.gov/vocabulary/iso639-5/cpf cpf Creoles and pidgins, French‑based créoles et pidgins basés sur le français
http://id.loc.gov/vocabulary/iso639-5/cpp cpp Creoles and pidgins, Portuguese-based créoles et pidgins basés sur le portugais
http://id.loc.gov/vocabulary/iso639-5/crp crp Creoles and pidgins créoles et pidgins
http://id.loc.gov/vocabulary/iso639-5/csu csu Central Sudanic languages soudaniques centrales, langues
http://id.loc.gov/vocabulary/iso639-5/cus cus Cushitic languages couchitiques, langues
http://id.loc.gov/vocabulary/iso639-5/day day Land Dayak languages dayak, langues
http://id.loc.gov/vocabulary/iso639-5/dmn dmn Mande languages mandé, langues
http://id.loc.gov/vocabulary/iso639-5/dra dra Dravidian languages dravidiennes, langues
http://id.loc.gov/vocabulary/iso639-5/egx egx Egyptian languages égyptiennes, langues
http://id.loc.gov/vocabulary/iso639-5/esx esx Eskimo-Aleut languages esquimaudes-aléoutiennes, langues
http://id.loc.gov/vocabulary/iso639-5/euq euq Basque (family) basque (famille)
http://id.loc.gov/vocabulary/iso639-5/fiu fiu Finno-Ugrian languages finno-ougriennes, langues
http://id.loc.gov/vocabulary/iso639-5/fox fox Formosan languages formosanes, langues
http://id.loc.gov/vocabulary/iso639-5/gem gem Germanic languages germaniques, langues
http://id.loc.gov/vocabulary/iso639-5/gme gme East Germanic languages germaniques orientales, langues
http://id.loc.gov/vocabulary/iso639-5/gmq gmq North Germanic languages germaniques septentrionales, langues
http://id.loc.gov/vocabulary/iso639-5/gmw gmw West Germanic languages germaniques occidentales, langues
http://id.loc.gov/vocabulary/iso639-5/grk grk Greek languages grecques, langues
http://id.loc.gov/vocabulary/iso639-5/hmx hmx Hmong-Mien languages hmong-mien, langues
http://id.loc.gov/vocabulary/iso639-5/hok hok Hokan languages hoka, langues
http://id.loc.gov/vocabulary/iso639-5/hyx hyx Armenian (family) arménien (famille)
http://id.loc.gov/vocabulary/iso639-5/iir iir Indo-Iranian languages indo-iraniennes, langues
http://id.loc.gov/vocabulary/iso639-5/ijo ijo Ijo languages ijo, langues
http://id.loc.gov/vocabulary/iso639-5/inc inc Indic languages indo-aryennes, langues
http://id.loc.gov/vocabulary/iso639-5/ine ine Indo-European languages indo-européennes, langues
http://id.loc.gov/vocabulary/iso639-5/ira ira Iranian languages iraniennes, langues
http://id.loc.gov/vocabulary/iso639-5/iro iro Iroquoian languages iroquoises, langues
http://id.loc.gov/vocabulary/iso639-5/itc itc Italic languages italiques, langues
http://id.loc.gov/vocabulary/iso639-5/jpx jpx Japanese (family) japonais (famille)
http://id.loc.gov/vocabulary/iso639-5/kar kar Karen languages karen, langues
http://id.loc.gov/vocabulary/iso639-5/kdo kdo Kordofanian languages kordofaniennes, langues
http://id.loc.gov/vocabulary/iso639-5/khi khi Khoisan languages khoïsan, langues
http://id.loc.gov/vocabulary/iso639-5/kro kro Kru languages krou, langues
http://id.loc.gov/vocabulary/iso639-5/map map Austronesian languages austronésiennes, langues
http://id.loc.gov/vocabulary/iso639-5/mkh mkh Mon-Khmer languages môn-khmer, langues
http://id.loc.gov/vocabulary/iso639-5/mno mno Manobo languages manobo, langues
http://id.loc.gov/vocabulary/iso639-5/mun mun Munda languages mounda, langues
http://id.loc.gov/vocabulary/iso639-5/myn myn Mayan languages maya, langues
http://id.loc.gov/vocabulary/iso639-5/nah nah Nahuatl languages nahuatl, langues
http://id.loc.gov/vocabulary/iso639-5/nai nai North American Indian languages nord-amérindiennes, langues
http://id.loc.gov/vocabulary/iso639-5/ngf ngf Trans-New Guinea languages trans-nouvelle-guinée, langues
http://id.loc.gov/vocabulary/iso639-5/nic nic Niger-Kordofanian languages nigéro-kordofaniennes, langues
http://id.loc.gov/vocabulary/iso639-5/nub nub Nubian languages nubiennes, langues
http://id.loc.gov/vocabulary/iso639-5/omq omq Oto-Manguean languages otomangue, langues
http://id.loc.gov/vocabulary/iso639-5/omv omv Omotic languages omotiques, langues
http://id.loc.gov/vocabulary/iso639-5/oto oto Otomian languages otomi, langues
http://id.loc.gov/vocabulary/iso639-5/paa paa Papuan languages papoues, langues
http://id.loc.gov/vocabulary/iso639-5/phi phi Philippine languages philippines, langues
http://id.loc.gov/vocabulary/iso639-5/plf plf Central Malayo-Polynesian languages malayo-polynésiennes centrales, langues
http://id.loc.gov/vocabulary/iso639-5/poz poz Malayo-Polynesian languages malayo-polynésiennes, langues
http://id.loc.gov/vocabulary/iso639-5/pqe pqe Eastern Malayo-Polynesian languages malayo-polynésiennes orientales, langues
http://id.loc.gov/vocabulary/iso639-5/pqw pqw Western Malayo-Polynesian languages malayo-polynésiennes occidentales, langues
http://id.loc.gov/vocabulary/iso639-5/pra pra Prakrit languages prâkrit, langues
http://id.loc.gov/vocabulary/iso639-5/qwe qwe Quechuan (family) quechua (famille)
http://id.loc.gov/vocabulary/iso639-5/roa roa Romance languages romanes, langues
http://id.loc.gov/vocabulary/iso639-5/sai sai South American Indian languages sud-amérindiennes, langues
http://id.loc.gov/vocabulary/iso639-5/sal sal Salishan languages salishennes, langues
http://id.loc.gov/vocabulary/iso639-5/sdv sdv Eastern Sudanic languages soudaniques orientales, langues
http://id.loc.gov/vocabulary/iso639-5/sem sem Semitic languages sémitiques, langues
http://id.loc.gov/vocabulary/iso639-5/sgn sgn sign languages langues des signes
http://id.loc.gov/vocabulary/iso639-5/sio sio Siouan languages sioux, langues
http://id.loc.gov/vocabulary/iso639-5/sit sit Sino-Tibetan languages sino-tibétaines, langues
http://id.loc.gov/vocabulary/iso639-5/sla sla Slavic languages slaves, langues
http://id.loc.gov/vocabulary/iso639-5/smi smi Sami languages sames, langues
http://id.loc.gov/vocabulary/iso639-5/son son Songhai languages songhai, langues
http://id.loc.gov/vocabulary/iso639-5/sqj sqj Albanian languages albanaises, langues
http://id.loc.gov/vocabulary/iso639-5/ssa ssa Nilo-Saharan languages nilo-sahariennes, langues
http://id.loc.gov/vocabulary/iso639-5/syd syd Samoyedic languages samoyèdes, langues
http://id.loc.gov/vocabulary/iso639-5/tai tai Tai languages tai, langues
http://id.loc.gov/vocabulary/iso639-5/tbq tbq Tibeto-Burman languages tibéto-birmanes, langues
http://id.loc.gov/vocabulary/iso639-5/trk trk Turkic languages turques, langues
http://id.loc.gov/vocabulary/iso639-5/tup tup Tupi languages tupi, langues
http://id.loc.gov/vocabulary/iso639-5/tut tut Altaic languages altaïques, langues
http://id.loc.gov/vocabulary/iso639-5/tuw tuw Tungus languages toungouses, langues
http://id.loc.gov/vocabulary/iso639-5/urj urj Uralic languages ouraliennes, langues
http://id.loc.gov/vocabulary/iso639-5/wak wak Wakashan languages wakashanes, langues
http://id.loc.gov/vocabulary/iso639-5/wen wen Sorbian languages sorabes, langues
http://id.loc.gov/vocabulary/iso639-5/xgn xgn Mongolian languages mongoles, langues
http://id.loc.gov/vocabulary/iso639-5/xnd xnd Na-Dene languages na-déné, langues
http://id.loc.gov/vocabulary/iso639-5/ypk ypk Yupik languages yupik, langues
http://id.loc.gov/vocabulary/iso639-5/zhx zhx Chinese (family) chinois (famille)
http://id.loc.gov/vocabulary/iso639-5/zle zle East Slavic languages slaves orientales, langues
http://id.loc.gov/vocabulary/iso639-5/zls zls South Slavic languages slaves méridionales, langues
http://id.loc.gov/vocabulary/iso639-5/zlw zlw West Slavic languages slaves occidentales, langues
http://id.loc.gov/vocabulary/iso639-5/znd znd Zande languages zandé, langues
6 changes: 5 additions & 1 deletion lib/languages.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
require_relative 'languages/version'
require_relative 'languages/constants'
require_relative 'languages/language'
require_relative 'languages/collective'

# Provides living, extinct, ancient, historic, and constructed languages, specified in ISO 639-3
module Languages
Expand Down Expand Up @@ -92,7 +93,10 @@ def load_tsv_data(filename)
@@data = load_tsv_data('iso-639-3.tsv') # rubocop:disable Style/ClassVars
.map { |row| row.to_h.transform_keys { |k| k.downcase.to_sym } }
.each_with_object({}) { |l, h| h[l[:id].to_sym] = Language.new(l) }
.freeze

collective_codes = load_tsv_data('iso639-5.tsv').to_h { |row| [row['code'].to_sym, Collective.new(row)] }
@@data.merge! collective_codes
@@data.freeze

load_tsv_data('iso-639-3-macrolanguages.tsv')
# Ignore deprecated mappings (i.e. row[2] = 'R')
Expand Down
18 changes: 18 additions & 0 deletions lib/languages/collective.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# frozen_string_literal: true

module Languages
  # Language group ("collective") identified by an ISO 639-5 code.
  #
  # Instances carry no ISO 639-1 or ISO 639-3 identifier; the ISO 639-5 code is
  # stored as both the bibliographic and the terminology three-letter code.
  class Collective < Language
    # Populates the instance directly from one row of the ISO 639-5 table,
    # without calling Language#initialize.
    #
    # @param csv_attributes [#[]] row keyed by column header; the 'code' and
    #   'Label (English)' entries are read
    def initialize(csv_attributes) # rubocop:disable Lint/MissingSuper
      code = csv_attributes['code']&.to_sym
      label = csv_attributes['Label (English)']

      @iso639_3, @iso639_2b, @iso639_2t, @iso639_1 = nil, code, code, nil
      @scope = :collective
      @type = :collective
      @name = label
    end
  end
end
4 changes: 2 additions & 2 deletions lib/languages/constants.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# frozen_string_literal: true

# Frozen lists of the type and scope names used for languages.
#
# NOTE(review): TYPES and SCOPES are each assigned twice below. The second pair
# (the one containing 'collective') appears to be the post-change side of this
# diff view — confirm that only one assignment per constant should remain.
module Languages
TYPES = %w[ancient constructed extinct historical living special].freeze
SCOPES = %w[individual macrolanguage special].freeze
TYPES = %w[ancient collective constructed extinct historical living special].freeze
SCOPES = %w[collective individual macrolanguage special].freeze
end
5 changes: 4 additions & 1 deletion lib/languages/language.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ def initialize(csv_attributes) # rubocop:disable Metrics/AbcSize, Metrics/Cyclom
alias iso639_5 iso639_2

alias alpha2 iso639_1
alias alpha3 iso639_3
alias alpha3_bibliographic iso639_2b
alias alpha3_terminology iso639_2t

Expand Down Expand Up @@ -58,5 +57,9 @@ def hash
# Orders two entries by their three-letter code.
#
# Compares #alpha3 instead of #iso639_3: entries without an ISO 639-3 id
# (ISO 639-5 collectives) have a nil #iso639_3, which made them all compare
# as equal to each other and incomparable (nil) with ordinary languages.
# For every entry that has an ISO 639-3 id, #alpha3 returns that id, so the
# result is unchanged.
#
# NOTE(review): the operands are kept in their original order (other first,
# self second), which yields a descending sort — confirm this is intended.
#
# @param other [#alpha3] entry to compare with
# @return [Integer, nil] -1, 0 or 1; nil when the codes cannot be compared
def <=>(other)
  other.alpha3 <=> alpha3
end

# Three-letter code for this entry.
#
# Prefers the ISO 639-3 id and falls back to the terminology code, then to
# the bibliographic code, when the preceding one is not set.
#
# @return [Symbol, nil] the first code that is set, or the value of
#   #iso639_2b when neither of the others is
def alpha3
  code = iso639_3
  code ||= iso639_2t
  code ||= iso639_2b
  code
end
end
end
33 changes: 33 additions & 0 deletions test/test_collective.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# frozen_string_literal: true

require 'test_helper'

# Unit tests for Languages::Collective, built from a single ISO 639-5 row.
class TestCollective < Minitest::Test
  def setup
    @collective = ::Languages::Collective.new({
                                                'code' => 'nah',
                                                'Label (English)' => 'Nahuatl languages'
                                              })
  end

  def test_that_it_has_a_name
    assert_instance_of String, @collective.name
    assert_equal 'Nahuatl languages', @collective.name
  end

  def test_that_it_has_no_alpha2
    assert_nil @collective.alpha2
  end

  def test_it_has_alpha3_bibliographic
    assert_equal :nah, @collective.alpha3_bibliographic
  end

  # The ISO 639-5 code is stored as the terminology code as well.
  def test_it_has_alpha3_terminology
    assert_equal :nah, @collective.alpha3_terminology
  end

  # A collective has no ISO 639-3 id, so #alpha3 has to fall back to the
  # ISO 639-5 code; lookups by three-letter code depend on this.
  def test_alpha3_falls_back_to_collective_code
    assert_equal :nah, @collective.alpha3
  end

  def test_it_has_collective_type
    assert_equal :collective, @collective.type
  end

  def test_it_has_collective_scope
    assert_equal :collective, @collective.scope
  end
end
14 changes: 14 additions & 0 deletions test/test_languages.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

require 'test_helper'

# rubocop:disable Metrics/ClassLength
class TestLanguages < Minitest::Test
def test_that_it_has_a_version_number
refute_nil ::Languages::VERSION
Expand Down Expand Up @@ -125,4 +126,17 @@ def test_macrolanguages_have_no_macrolanguage

assert_empty(macrolanguages.reject { |l| l.macrolanguage.nil? })
end

# Every code listed in data/iso639-5.tsv must be reachable through
# Languages.all via #alpha3.
def test_iso_639_5_collective_codes_are_included
  # Keep in sync with data/iso639-5.tsv (115 codes).
  collective_codes = %i[aav afa alg alv apa aqa aql art ath auf aus awd azc bad bai bat ber bih bnt btk cai cau cba
                        ccn ccs cdc cdd cel cmc cpe cpf cpp crp csu cus day dmn dra egx esx euq fiu fox gem gme gmq
                        gmw grk hmx hok hyx iir ijo inc ine ira iro itc jpx kar kdo khi kro map mkh mno mun myn nah
                        nai ngf nic nub omq omv oto paa phi plf poz pqe pqw pra qwe roa sai sal sdv sem sgn sio sit
                        sla smi son sqj ssa syd tai tbq trk tup tut tuw urj wak wen xgn xnd ypk zhx zle zls zlw znd]

  # Collect the known codes once instead of once per expected code.
  known_codes = ::Languages.all.map(&:alpha3)
  missing = collective_codes - known_codes

  # Reporting the difference names the offending codes when the test fails.
  assert_empty missing, "ISO 639-5 codes missing from Languages.all: #{missing.inspect}"
end
end
# rubocop:enable Metrics/ClassLength

0 comments on commit 3a738dc

Please sign in to comment.