Skip to content

Commit cbbfb64

Browse files
committed
added some wiki processing changes
1 parent ee8988e commit cbbfb64

18 files changed

+40190
-15
lines changed

Data-Source/Data-Wiki/ful/AA/wiki_00

Lines changed: 2259 additions & 0 deletions
Large diffs are not rendered by default.

Data-Source/Data-Wiki/hau/AA/wiki_00

Lines changed: 12118 additions & 0 deletions
Large diffs are not rendered by default.

Data-Source/Data-Wiki/ibo/AA/wiki_00

Lines changed: 6740 additions & 0 deletions
Large diffs are not rendered by default.

Data-Source/Data-Wiki/nav/AA/wiki_00

Lines changed: 11952 additions & 0 deletions
Large diffs are not rendered by default.

Data-Source/Data-Wiki/nav/AA/wiki_01

Lines changed: 7003 additions & 0 deletions
Large diffs are not rendered by default.

Dependencies/wikipedia-process.bash

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -79,24 +79,28 @@ pushd $DIR_WIKI_DATA
7979
find * -maxdepth 0 -type d \( ! -iname ".*" \) >> "$HOME_FOLDER"/$WIKI_LANGUAGES
8080
popd
8181

82+
# For each item/line in $WIKI_LANGUAGES, check whether the line already exists in $LANGUAGE_LIST_FILE and, if not, append it.
83+
8284
for i in $(cat $WIKI_LANGUAGES);do
83-
grep -Fxq "$i" $LANGUAGE_LIST_FILE || echo $i >> $LANGUAGE_LIST_FILE
84-
grep -Fxq "$i" $CORPORA_LANGUAGES || echo $i >> $CORPORA_LANGUAGES
85+
grep -Fxq "$i" $LANGUAGE_LIST_FILE || echo "$i" >> $LANGUAGE_LIST_FILE #There is a bug here and I can not seem to pass data into this$LANGUAGE_LIST_FILE.
86+
grep -Fxq "$i" $CORPORA_LANGUAGES || echo "$i" >> $CORPORA_LANGUAGES
8587
done
8688

8789

8890
# Set the Variables.
91+
92+
#turn the list into a long list (array) without newlines instead of a tall list
8993
WIKI_LANGUAGESString=$(cat $WIKI_LANGUAGES | tr "\n" " ")
90-
WIKI_LANGUAGES_ARRAY=($WIKI_LANGUAGESString) #There is a bug here (or at least a bad programming practice). The file veriable has one name and the same name is used later for a different meaning. Fixed on 15 July 2015 by adding "_ARRAY at the end of the variable name".
94+
WIKI_LANGUAGES_ARRAY=($WIKI_LANGUAGESString)
9195

9296
# This section needs to be modified to allow the arrangement of info
9397
# to be by corpus type (Wikipedia/James) or by language (Navajo/Igbo)
9498

95-
echo "INFO: It looks like we were able to extract ${#WIKI_LANGUAGES_ARRAY[@]} Wikipedia based corpora."
99+
echo "INFO: It looks like we were able to extract ${#WIKI_LANGUAGES_ARRAY[@]} Wikipedia based corpora." #There is a bug here and I can\'t seem to find out why the data is not being passed correctly. The same thing is happening in James.
96100
echo " Including the following languages: ${WIKI_LANGUAGES_ARRAY[*]}"
97101
echo
98102

99-
exit;0
103+
100104

101105
# Take the languages from Wikipedia and append them to the master language list; making sure not to add duplicates
102106

@@ -108,8 +112,8 @@ exit;0
108112
# Some_third_array_count=$((${#Some_array[*]} + ${#Some_other_array[*]}))
109113

110114

111-
csvfix unique Wikipedia_Languages.txt $LANGUAGE_LIST_FILE | csvfix write_dsv -s ' ' -o $LANGUAGE_LIST_FILE
112-
115+
csvfix unique /Temp-Files/Languages-Used/Wikipedia_Languages.txt $LANGUAGE_LIST_FILE | csvfix write_dsv -s ' ' -o $LANGUAGE_LIST_FILE
116+
exit;0
113117

114118
# Set the Variables.
115119
LANGUAGE_IDString=$(cat $LANGUAGE_LIST_FILE | tr "\n" " ")

awesome-script.bash

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ for i in $(cat $JAMES_LIST_FILE); do
170170
done
171171

172172
# Take the languages from James and add them to the master language list.
173+
echo $(cat $JAMES_LANGUAGES)
173174
for i in $(cat $JAMES_LANGUAGES);do
174175
grep -Fxq "$i" $LANGUAGE_LIST_FILE || echo $i >> $LANGUAGE_LIST_FILE
175176
grep -Fxq "$i" $CORPORA_LANGUAGES || echo $i >> $CORPORA_LANGUAGES
@@ -181,9 +182,8 @@ done
181182
JAMES_LANGUAGESString=$(cat $JAMES_LANGUAGES | tr "\n" " ")
182183
JAMES_LANGUAGES_ARRAY=($JAMES_LANGUAGESString) #There is a bug here (or at least a bad programming practice). The file veriable has one name and the same name is used later for a different meaning. Fixed on 15 July 2015 by adding "_ARRAY at the end of the variable name".
183184

184-
185185
## These are not reading as arrays. Rather they are reading as literals or paths OR they are reading as just the first line of the file.
186-
echo "INFO: It looks like altogether we found: ${#JAMES_LANGUAGES_ARRAY[@]} James based corpora."
186+
echo "INFO: It looks like altogether we found: ${#JAMES_LANGUAGESString[@]} James based corpora."
187187
echo " Including the following languages: ${JAMES_LANGUAGES_ARRAY[*]}"
188188
echo
189189

example-ori-corpus-james-asg.txt

Lines changed: 40 additions & 0 deletions
Large diffs are not rendered by default.

example-ori-corpus-james-bkv.txt

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

example-ori-corpus-james-deu.txt

Lines changed: 22 additions & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)