Skip to content

Commit

Permalink
Merge branch 'clarin-eric:data' into data
Browse files Browse the repository at this point in the history
  • Loading branch information
DimitrisGk-iel authored Oct 9, 2023
2 parents ea6bfde + d38275c commit 3ce4561
Show file tree
Hide file tree
Showing 475 changed files with 142,462 additions and 104,263 deletions.
5 changes: 4 additions & 1 deletion .github/actions/ParlaMintValidate/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@ inputs:
parlas:
description: 'list of parliament codes'
required: true
requireTaxonomiesTranslations:
description: 'require every term in common taxonomies to be translated'
default: '1'
runs:
using: "composite"
steps:
- name: Convert and Validate
run: ${{ github.action_path }}/validate.sh '${{inputs.parlas}}'
run: ${{ github.action_path }}/validate.sh '${{inputs.parlas}}' '${{inputs.requireTaxonomiesTranslations}}'
shell: bash
16 changes: 16 additions & 0 deletions .github/actions/ParlaMintValidate/validate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,18 @@ for parla in $(jq -r '.[]' <<< $1 ); do
echo "Cleaning old sample files [$parla]"
rm -f ${DATADIR}/ParlaMint-$parla/ParlaMint-*.{txt,tsv,conllu,vert}

# Taxonomy handling for the current $parla.
# $2 is the requireTaxonomiesTranslations flag handed over by action.yml:
#   '1'           -> every term of the common taxonomies must be translated
#   anything else -> only initialise the taxonomies, translations are not checked
# The flag is quoted (with an empty default) so that a missing or empty value
# cleanly selects the else branch instead of making `[` fail with a syntax error.
if [ "${2:-}" = '1' ] ; then
  echo "INFO check whether are taxonomies translated"
  # sed prefixes every line mentioning "error" (case-insensitive) with ::error::
  # so that GitHub renders it as an annotation; tee keeps a copy in the log.
  make translateTaxonomies-$parla | sed "s/^\(.*\)\(\berror\b\)/::error::\1\2/i" | tee $DIR/taxonomies.log
  make initTaxonomies-$parla
  echo "INFO overwriting taxonomies that are expected to be translated"
  make initTaxonomies4translation-$parla
  # Append (-a): the translateTaxonomies output logged above must not be overwritten.
  make validateTaxonomies-$parla | sed "s/^\(.*\)\(\berror\b\)/::error:: incomplete taxonomy translation \1\2/i" | tee -a $DIR/taxonomies.log
else
  echo "::warning:: INFO initialize taxonomies with no translations - check if correct(known) ids has been used"
  make initTaxonomies-$parla
fi

if [ -f "${DATADIR}/ParlaMint-$parla/ParlaMint-$parla.xml" ] ; then

( Scripts/validate-parlamint.pl Schema ${DATADIR}/ParlaMint-$parla 2>&1 || echo "ERROR: validate-parlamint.pl exited with <> 0" ) \
Expand Down Expand Up @@ -55,6 +67,10 @@ for parla in $(jq -r '.[]' <<< $1 ); do
FAIL=1
echo "::error:: ParlaMint-$parla validation failed"
fi

echo "::warning:: TMP restore taxonomy"
# Revert the taxonomy files in the working tree to their checked-in state before
# the next iteration. `--` marks everything after it as a path, so a glob result
# can never be interpreted as a branch or commit name.
git checkout -- Corpora/Taxonomies/ParlaMint-taxonomy*
git checkout -- ${DATADIR}/ParlaMint-$parla/ParlaMint-taxonomy*
done

if [ $FAIL -eq 1 ] ; then
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/validate.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ jobs:
uses: ./ParlaMint/.github/actions/ParlaMintValidate
with:
parlas: '["${{matrix.parla}}"]'
requireTaxonomiesTranslations: '${{ vars.REQUIRE_TRANSLATIONS }}'
Validate:
runs-on: ubuntu-latest
needs: [ValidateCountries]
Expand Down
128 changes: 78 additions & 50 deletions Corpora/Makefile
Original file line number Diff line number Diff line change
@@ -1,43 +1,3 @@
## Transliteration tests
test-translit4:
${vlp} ParlaMint-GR-listPerson.xml
${vlo} ParlaMint-GR-listOrg.xml
test-translit3:
../Scripts/trans-execute.pl Sources-TEI/ParlaMint-BG.TEI/ParlaMint-BG-listPerson.xml ParlaMint-BG-listPerson.xml
../Scripts/trans-execute.pl Sources-TEI/ParlaMint-BG.TEI/ParlaMint-BG-listOrg.xml ParlaMint-BG-listOrg.xml

../Scripts/trans-execute.pl Sources-TEI/ParlaMint-GR.TEI/ParlaMint-GR-listPerson.xml ParlaMint-GR-listPerson.xml
../Scripts/trans-execute.pl Sources-TEI/ParlaMint-GR.TEI/ParlaMint-GR-listOrg.xml ParlaMint-GR-listOrg.xml

../Scripts/trans-execute.pl Sources-TEI/ParlaMint-UA.TEI/ParlaMint-UA-listPerson.xml ParlaMint-UA-listPerson.xml
../Scripts/trans-execute.pl Sources-TEI/ParlaMint-UA.TEI/ParlaMint-UA-listOrg.xml ParlaMint-UA-listOrg.xml
test-translit2:
bin/trans-execute.pl Sources-TEI/ParlaMint-GR.TEI/ParlaMint-GR-listOrg.xml ParlaMint-GR-listOrg.xml
test-translit1:
$s -xsl:bin/trans-tei2tsv.xsl Sources-TEI/ParlaMint-BG.TEI/ParlaMint-BG-listOrg.xml > ParlaMint-BG-listOrg.tsv


######## Merging taxonomies

TAXONOMIES-TEI = subcorpus speaker_types parla.legislature
TAXONOMIES-ANA = NER

merge-taxos-nohup:
nohup time make merge-taxos 2> Taxonomies/ParlaMint-taxonomy-merge.log > Logs/ParlaMint-taxonomy.log &

merge-taxos:
for TAXONOMY in ${TAXONOMIES-TEI}; do \
$s template=../Corpora/Taxonomies/ParlaMint-taxonomy-$${TAXONOMY}.template.xml \
-xsl:../Scripts/parlamint-merge-taxonomy.xsl Master/ParlaMint.xml \
> Taxonomies/ParlaMint-taxonomy-$${TAXONOMY}.xml; \
done;
for TAXONOMY in ${TAXONOMIES-ANA}; do \
$s template=../Corpora/Taxonomies/ParlaMint-taxonomy-$${TAXONOMY}.ana.template.xml \
-xsl:../Scripts/parlamint-merge-taxonomy.xsl Master/ParlaMint.ana.xml \
> Taxonomies/ParlaMint-taxonomy-$${TAXONOMY}.ana.xml; \
done;
${vta} Taxonomies/ParlaMint-taxonomy-*.xml

############### Makefile for making a distributable version of the ParlaMint and ParlaMint-en corpora

### VARIABLES
Expand All @@ -50,13 +10,14 @@ merge-taxos:
#CORPORA = AT BA BE BG CZ DK EE ES ES-CT ES-GA FR GB GR HR HU IS IT LV NL NO PL PT RS SE SI TR UA

# Partial runs:
CORPORA = AT
#CORPORA = GR BG UA
CORPORA = FI

######## MTed CORPORA FOR V 3.1
MT-CORPORA = AT-en BA-en BE-en BG-en CZ-en DK-en EE-en ES-en ES-CT-en ES-GA-en FR-en GR-en HR-en HU-en IS-en IT-en LV-en NL-en NO-en PL-en PT-en RS-en SE-en SI-en TR-en UA-en

# Used in test targets:
CORPUS = UA
CORPUS = UK

#Where things are, as we use several branches: this one (most likely dev), and documentation
PARLAMINT = /project/corpora/Parla/ParlaMint/ParlaMint
Expand All @@ -66,7 +27,7 @@ TEMP = ${HERE}/Temp

#Where the submitted corpora are found (ParlaMint- .TEI/ and .TEI.ana/)
SOURCES = ${HERE}/Sources-TEI
SOURCES-MT = ${HERE}/Sources-MT
SOURCES-MT = ${HERE}/Sources-Sem

# Version number and PID of next(!) TEI and TEI.ana ParlaMint release
VERSION = 3.1
Expand All @@ -82,13 +43,80 @@ WEB = [email protected]:/home/tomaz/www/tmp/ParlaMint/

###### Targets

###### Tests
# Runs ${FINALIZE} with -tei -valid for corpus code FI, reading from Temp and
# writing to Temp/Out. FINALIZE is defined outside this section of the Makefile.
# Declared phony: the target names a command, not a file it creates.
.PHONY: test-tei1
test-tei1:
	${FINALIZE} -tei -valid -codes FI -in ${HERE}/Temp -out ${HERE}/Temp/Out
# Vertical-format test targets. None of them creates a file named after the
# target, so they are declared phony and always run when requested.
# NOTE(review): $s is defined outside this section; presumably the XSLT
# processor invocation - confirm.
.PHONY: test-vert4 test-vert3 test-vert2 test-vert1

# Apply parlamint2xmlvert.xsl to one BG component file, passing the BG corpus
# root as the "meta" parameter; the result is written to test.vert.
test-vert4:
	$s meta=${HERE}/Master/ParlaMint-BG.TEI.ana/ParlaMint-BG.ana.xml -xsl:../Scripts/parlamint2xmlvert.xsl \
	    ${HERE}/Master/ParlaMint-BG.TEI.ana/2014/ParlaMint-BG_2014-10-27.ana.xml > test.vert

# Same as test-vert4, for one UA component file; also writes test.vert.
test-vert3:
	$s meta=${HERE}/Master/ParlaMint-UA.TEI.ana/ParlaMint-UA.ana.xml -xsl:../Scripts/parlamint2xmlvert.xsl \
	    ${HERE}/Master/ParlaMint-UA.TEI.ana/2012/ParlaMint-UA_2012-12-04-m0.ana.xml > test.vert

# Run ${FINALIZE} -vert for ES-PV, in place inside Master.
test-vert2:
	${FINALIZE} -vert -codes ES-PV -in ${HERE}/Master -out ${HERE}/Master

# Run ${FINALIZE} -vert for GR and then GR-en, in place inside Temp.
test-vert1:
	${FINALIZE} -vert -codes GR -in ${HERE}/Temp -out ${HERE}/Temp
	${FINALIZE} -vert -codes GR-en -in ${HERE}/Temp -out ${HERE}/Temp
# CoNLL test targets: each runs ${FINALIZE} -conll for a single corpus code,
# reading from and writing to the same directory. They are declared phony
# because they do not create a file named after the target.
.PHONY: test-conll3 test-conll2 test-conll1

# GR-en, in place inside Temp.
test-conll3:
	${FINALIZE} -conll -codes GR-en -in ${HERE}/Temp -out ${HERE}/Temp

# GR, in place inside Temp.
test-conll2:
	${FINALIZE} -conll -codes GR -in ${HERE}/Temp -out ${HERE}/Temp

# SI, in place inside Master.
test-conll1:
	${FINALIZE} -conll -codes SI -in ${HERE}/Master -out ${HERE}/Master
# Metadata / plain-text test targets, declared phony because neither creates a
# file named after the target.
.PHONY: test-meta2 test-meta1

# Apply parlamint2meta.xsl to one BE component file, passing the BE corpus root
# as the "meta" parameter; the result is written to test.tsv.
# NOTE(review): $s is defined outside this section; presumably the XSLT
# processor invocation - confirm.
test-meta2:
	$s meta=../Corpora/Master/ParlaMint-BE.TEI/ParlaMint-BE.xml -xsl:../Scripts/parlamint2meta.xsl \
	    ../Corpora/Master/ParlaMint-BE.TEI/2014/ParlaMint-BE_2014-06-19-54-plenair-ip001x.xml > test.tsv

# Run ${FINALIZE} -txt for GR and then GR-en, in place inside Temp.
test-meta1:
	${FINALIZE} -txt -codes GR -in ${HERE}/Temp -out ${HERE}/Temp
	${FINALIZE} -txt -codes GR-en -in ${HERE}/Temp -out ${HERE}/Temp

## Transliteration tests
# Declared phony: the targets name commands, not files they create.
.PHONY: test-translit4 test-translit3 test-translit2 test-translit1

# Run ${vlp} / ${vlo} on the GR listPerson / listOrg files in the current directory.
# NOTE(review): vlp and vlo are defined outside this section; presumably the
# listPerson / listOrg validation commands - confirm.
test-translit4:
	${vlp} ParlaMint-GR-listPerson.xml
	${vlo} ParlaMint-GR-listOrg.xml

# Run trans-execute.pl on the submitted listPerson and listOrg files of BG, GR
# and UA (first argument), naming a file in the current directory as the second.
test-translit3:
	../Scripts/trans-execute.pl Sources-TEI/ParlaMint-BG.TEI/ParlaMint-BG-listPerson.xml ParlaMint-BG-listPerson.xml
	../Scripts/trans-execute.pl Sources-TEI/ParlaMint-BG.TEI/ParlaMint-BG-listOrg.xml ParlaMint-BG-listOrg.xml

	../Scripts/trans-execute.pl Sources-TEI/ParlaMint-GR.TEI/ParlaMint-GR-listPerson.xml ParlaMint-GR-listPerson.xml
	../Scripts/trans-execute.pl Sources-TEI/ParlaMint-GR.TEI/ParlaMint-GR-listOrg.xml ParlaMint-GR-listOrg.xml

	../Scripts/trans-execute.pl Sources-TEI/ParlaMint-UA.TEI/ParlaMint-UA-listPerson.xml ParlaMint-UA-listPerson.xml
	../Scripts/trans-execute.pl Sources-TEI/ParlaMint-UA.TEI/ParlaMint-UA-listOrg.xml ParlaMint-UA-listOrg.xml

# NOTE(review): this target and test-translit1 reference bin/ while
# test-translit3 uses ../Scripts/ for the same script name - confirm which
# location is current.
test-translit2:
	bin/trans-execute.pl Sources-TEI/ParlaMint-GR.TEI/ParlaMint-GR-listOrg.xml ParlaMint-GR-listOrg.xml

# Apply bin/trans-tei2tsv.xsl to the BG listOrg file; output goes to ParlaMint-BG-listOrg.tsv.
test-translit1:
	$s -xsl:bin/trans-tei2tsv.xsl Sources-TEI/ParlaMint-BG.TEI/ParlaMint-BG-listOrg.xml > ParlaMint-BG-listOrg.tsv


######## Merging taxonomies

# Taxonomy names iterated over by merge-taxos; each <name> selects
# Taxonomies/ParlaMint-taxonomy-<name>.template.xml as the template.
TAXONOMIES-TEI = subcorpus speaker_types parla.legislature
# Same, for the annotated variant: each <name> selects
# Taxonomies/ParlaMint-taxonomy-<name>.ana.template.xml as the template.
TAXONOMIES-ANA = NER

# Start merge-taxos in the background under nohup and time.
# stderr goes to Taxonomies/ParlaMint-taxonomy-merge.log, stdout to
# Logs/ParlaMint-taxonomy.log.
# $(MAKE) is used instead of a literal "make" so the sub-make is the same
# program, with the same flags, as the one the user invoked.
.PHONY: merge-taxos-nohup
merge-taxos-nohup:
	nohup time $(MAKE) merge-taxos 2> Taxonomies/ParlaMint-taxonomy-merge.log > Logs/ParlaMint-taxonomy.log &

# Regenerate the common taxonomy files from their templates.
# For every name in TAXONOMIES-TEI, parlamint-merge-taxonomy.xsl is applied to
# Master/ParlaMint.xml with the matching *.template.xml passed as "template";
# for every name in TAXONOMIES-ANA the same is done with Master/ParlaMint.ana.xml
# and the *.ana.template.xml template. Finally ${vta} is run on all results.
# NOTE(review): $s and ${vta} are defined outside this section; presumably the
# XSLT processor invocation and a taxonomy validation command - confirm.
# "|| exit 1" stops at the first failing transform; without it a shell "for"
# loop only reports the status of its last iteration, so earlier failures
# would go unnoticed.
.PHONY: merge-taxos
merge-taxos:
	for TAXONOMY in ${TAXONOMIES-TEI}; do \
	    $s template=../Corpora/Taxonomies/ParlaMint-taxonomy-$${TAXONOMY}.template.xml \
	    -xsl:../Scripts/parlamint-merge-taxonomy.xsl Master/ParlaMint.xml \
	    > Taxonomies/ParlaMint-taxonomy-$${TAXONOMY}.xml || exit 1; \
	done
	for TAXONOMY in ${TAXONOMIES-ANA}; do \
	    $s template=../Corpora/Taxonomies/ParlaMint-taxonomy-$${TAXONOMY}.ana.template.xml \
	    -xsl:../Scripts/parlamint-merge-taxonomy.xsl Master/ParlaMint.ana.xml \
	    > Taxonomies/ParlaMint-taxonomy-$${TAXONOMY}.ana.xml || exit 1; \
	done
	${vta} Taxonomies/ParlaMint-taxonomy-*.xml

### Fixes for 3.1-en:

# Instead of TEI-derived CoNLL-U files we release original MTed CoNLL-U
# because they also contain word alignments
# Instead of TEI-derived CoNLL-U files we release MTed CoNLL-U
# merged with TEI-derived CoNLL-U because they also contain word alignments
# Script also adds -en suffix to filenames + readme.
mt-cp-conllu:
bin/cp-conllu.pl validate '${SOURCES-MT}/ParlaMint-*-en.conllu' 'Master'
bin/cp-conllu.pl Master validate '${SOURCES-MT}/ParlaMint-*-en.conllu' 'Master'

# Make txt and tsv files with tsvs
mt-convert-txt:
Expand Down Expand Up @@ -249,9 +277,9 @@ mt-convert:

mt-test8:
$s -xsl:../Scripts/validate-parlamint.xsl \
${HERE}/Master/ParlaMint-AT-en.TEI.ana/ParlaMint-AT-en.ana.xml
$s meta=${HERE}/Master/ParlaMint-AT-en.TEI.ana/ParlaMint-AT-en.ana.xml -xsl:../Scripts/validate-parlamint.xsl \
${HERE}/Master/ParlaMint-AT-en.TEI.ana/2022/ParlaMint-AT-en_2022-01-20-027-XXVII-NRSITZ-00139.ana.xml
${HERE}/Master/ParlaMint-UA.TEI.ana/ParlaMint-UA.ana.xml
$s meta=${HERE}/Master/ParlaMint-UA.TEI.ana/ParlaMint-UA.ana.xml -xsl:../Scripts/validate-parlamint.xsl \
${HERE}/Master/ParlaMint-UA.TEI.ana/2022/ParlaMint-UA_2022-01-25-m0.ana.xml
mt-test7:
$s meta=${HERE}/Master/ParlaMint-AT-en.TEI.ana/ParlaMint-AT-en.ana.xml -xsl:../Scripts//check-links.xsl \
${HERE}/Master/ParlaMint-AT-en.TEI.ana/2022/ParlaMint-AT-en_2022-01-20-027-XXVII-NRSITZ-00139.ana.xml
Expand All @@ -276,7 +304,7 @@ mt-test2a:
perl ../Scripts/conllu2tei.pl < Test/0.conllu > Test/0.body.xml
mt-test2:
perl ../Scripts/conllu2tei.pl \
< ${SOURCES-MT}/ParlaMint-LV-en.conllu/2015/ParlaMint-LV_2015-11-12-PT12-329.conllu \
< ${SOURCES-MT}/USAS/ParlaMint-LV-en.conllu/2015/ParlaMint-LV-en_2015-11-12-PT12-329.conllu \
> Test/ParlaMint-LV_2015-11-12-PT12-329.body.xml
xmllint --noout Test/ParlaMint-LV_2015-11-12-PT12-329.body.xml
mt-test1:
Expand Down
2 changes: 2 additions & 0 deletions Corpora/Master/ParlaMint.ana.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2913,6 +2913,8 @@
href="ParlaMint-ES-GA.TEI.ana/ParlaMint-ES-GA.ana.xml"/>
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude"
href="ParlaMint-ES.TEI.ana/ParlaMint-ES.ana.xml"/>
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude"
href="ParlaMint-FI.TEI.ana/ParlaMint-FI.ana.xml"/>
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude"
href="ParlaMint-FR.TEI.ana/ParlaMint-FR.ana.xml"/>
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude"
Expand Down
Loading

0 comments on commit 3ce4561

Please sign in to comment.