Skip to content

Commit 3386c1f

Browse files
„feldmueller“ authored and kupietz committed
update functions and tests to align with RKorAPClient 0.9.0
Change-Id: I0221c6cc0b9180bc83feb96651e0a5f204846451
1 parent e8c7adc commit 3386c1f

File tree

4 files changed

+65
-4
lines changed

4 files changed

+65
-4
lines changed

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,15 @@
11
# Version history
22

3+
## 0.9.0
4+
5+
- Updates recommended RKorAPClient version to 0.9.0
6+
- Added `matchStart` and `matchEnd` columns to corpusQuery results, containing the start and end positions of the match in the text
7+
- Added `mergeDuplicateCollocates` function to merge collocation analysis results for different context positions
8+
- Added a `query` column to collocation analysis results
9+
- Improved documentation for span parameter in `collocationAnalysis` functions
10+
- Updated `textMetadata` method to use new metadata fields API, if available, to retrieve custom metadata for a text based on its sigle
11+
- Added new unit tests to cover the new features and changes
12+
313
## 0.8.1
414

515
- Updates recommended RKorAPClient version to 0.8.1

KorAPClient/__init__.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from packaging import version
1616
from rpy2.robjects.methods import RS4
1717

18-
CURRENT_R_PACKAGE_VERSION = "0.8.1"
18+
CURRENT_R_PACKAGE_VERSION = "0.9.0"
1919

2020
KorAPClient = packages.importr('RKorAPClient')
2121
if version.parse(KorAPClient.__version__) < version.parse(CURRENT_R_PACKAGE_VERSION):
@@ -206,7 +206,7 @@ def collocationAnalysis(self, node, vc="", **kwargs):
206206
- **topCollocatesLimit** - limit analysis to the n most frequent collocates in the search hits sample
207207
- **searchHitsSampleLimit** - limit the size of the search hits sample
208208
- **ignoreCollocateCase** - bool, set to True if collocate case should be ignored
209-
- **withinSpan** - KorAP span specification for collocations to be searched within
209+
- **withinSpan** - KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`
210210
- **exactFrequencies** - if False, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
211211
- **stopwords** - vector of stopwords not to be considered as collocates
212212
- **seed** - seed for random page collecting order
@@ -229,6 +229,11 @@ def collocationAnalysis(self, node, vc="", **kwargs):
229229
"""
230230
return KorAPClient.collocationAnalysis(self, node, vc, **kwargs)
231231

232+
def mergeDuplicateCollocates(self, *args, **kwargs):
    """Combine collocation analysis results for different context positions.

    Thin wrapper: every positional and keyword argument is handed on
    unchanged to `mergeDuplicateCollocates` of the underlying R package.
    The connection object itself is not forwarded.

    Returns:
        Whatever the wrapped R function returns.
    """
    merge = KorAPClient.mergeDuplicateCollocates
    return merge(*args, **kwargs)
235+
236+
232237
def corpusQuery(self, *args, **kwargs):
233238
"""Query search term(s).
234239
@@ -237,7 +242,7 @@ def corpusQuery(self, *args, **kwargs):
237242
- **KorAPUrl** - instead of specifying the `query` and `vc` string parameters, you can copy your KorAP query URL here from the browser
238243
- **metadataOnly** - determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. (default = True)
239244
- **ql** - query language: `"poliqarp" | "cosmas2" | "annis" | "cql" | "fcsql"` (default = `"poliqarp"`)
240-
- **fields** - (meta)data fields that will be fetched for every match (default = `["corpusSigle", "textSigle", "pubDate", "pubPlace", "availability", "textClass"]`)
245+
- **fields** - (meta)data fields that will be fetched for every match (default = `["corpusSigle", "textSigle", "pubDate", "pubPlace", "availability", "textClass", "matchStart", "matchEnd"]`)
241246
- **verbose** - (default = `self.verbose`)
242247
243248
Returns:

KorAPClient/tests/test_korapclient.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,52 @@ def test_textMetadata(self):
8989
self.assertIn('creationDate', df.columns)
9090
self.assertIn('pubPlace', df.columns)
9191
self.assertIn('author', df.columns)
92+
93+
def test_corpus_query_token_api(self):
    """Check the token-level match and context columns of a focus query."""
    query = self.kcon.corpusQuery(
        "focus([tt/p=ADJA] {Newstickeritis})",
        vc="corpusSigle=/W.D17/",
        metadataOnly=False,
    )
    collected = query.fetchNext().slots['collectedMatches']

    self.assertGreater(len(collected), 10)

    # The focused token must be the same word in every hit.
    distinct_tokens = collected['tokens.match'].unique()
    self.assertEqual(len(distinct_tokens), 1)
    self.assertEqual(distinct_tokens[0], "Newstickeritis")

    # Spot-check one expected word on each side of the match.
    left_side = collected['tokens.left']
    self.assertTrue(any('reine' in left for left in left_side))

    right_side = collected['tokens.right']
    self.assertTrue(any('Begriff' in right for right in right_side))
109+
110+
def test_match_start_and_end(self):
    """Check that `matchStart` and `matchEnd` are present and agree here."""
    result = self.kcon.corpusQuery(
        "focus([tt/p=ADJA] {Newstickeritis})",
        vc="corpusSigle=/W.D17/",
        metadataOnly=False,
    ).fetchNext()
    hits = result.slots['collectedMatches']

    ends = hits['matchEnd']
    self.assertGreater(ends.max(), 1000)
    # For this query the test expects start and end to coincide in every row.
    self.assertTrue((ends == hits['matchStart']).all())
117+
118+
def test_extended_metadata_fields_ked(self):
    """Check that custom `KED.*` metadata fields come back from the KED instance."""
    ked_connection = KorAPConnection(
        KorAPUrl="https://korap.ids-mannheim.de/instance/ked",
        verbose=True,
    )
    requested_fields = [
        "textSigle", "pubDate", "pubPlace", "availability", "textClass",
        "snippet", "tokens", "KED.cover1Herder", "KED.cover2Herder",
        "KED.cover3Herder", "KED.cover4Herder", "KED.cover5Herder",
        "KED.nPara", "KED.nPunct1kTks", "KED.nSent", "KED.nToks",
        "KED.nToksSentMd", "KED.nTyps", "KED.rcpnt", "KED.rcpntLabel",
        "KED.strtgy", "KED.strtgyLabel", "KED.topic", "KED.topicLabel",
        "KED.txttyp", "KED.txttypLabel"
    ]
    fetched = ked_connection.corpusQuery("einfache", fields=requested_fields).fetchAll()
    collected = fetched.slots['collectedMatches']

    self.assertGreater(len(collected), 0)
    # Lower bounds on the smallest value of each custom column.
    self.assertGreater(min(collected['KED.nToks'].astype(float)), 100)
    self.assertGreater(min(collected['KED.nSent'].astype(float)), 8)
    self.assertGreater(min(collected['KED.rcpnt'].str.len()), 5)
137+
92138

93139

94140
if __name__ == '__main__':

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "KorAPClient"
3-
version = "0.8.1"
3+
version = "0.9.0"
44
description = "Client package to access KorAP's web service API"
55
authors = [
66
{name = "Marc Kupietz",email = "[email protected]"},

0 commit comments

Comments
 (0)