Skip to content
Snippets Groups Projects
Commit e12bd853 authored by Benjamin Murauer's avatar Benjamin Murauer
Browse files

release v0.4.11: added tests for StanzaToNlpFieldTransformer, changed output...

release v0.4.11: added tests for StanzaToNlpFieldTransformer, changed output to nested list instead of strings.
parent 375b3ee9
No related branches found
No related tags found
No related merge requests found
Pipeline #51213 failed
[tool.poetry]
name = "tuhlbox"
version = "0.4.10"
version = "0.4.11"
homepage = "https://git.uibk.ac.at/csak8736/tuhlbox"
description = "Personal toolbox of language processing models."
authors = ["Benjamin Murauer <b.murauer@posteo.de>"]
......
from typing import List
from tuhlbox.stanza import StanzaNlpToFieldTransformer, StanzaParserTransformer
sentences: List
def setup_function() -> None:
    """Parse the fixture texts and publish them for the tests in this module.

    The parsed output is stored in the module-level ``sentences`` variable,
    which the test functions pass to the transformer under test.
    """
    global sentences
    raw_texts = [
        "I have been trying to reach you. This is another sentence in this document.",
        "This is a second document, independent from the others.",
    ]
    # English pipeline, constructed with silent=True.
    sentences = StanzaParserTransformer("en", silent=True).transform(raw_texts)
def test_stanza_nlp_to_field_transformer_xpos() -> None:
    """Extracting "xpos" yields one list of tags per sentence, per document."""
    first_document = [
        ["PRP", "VBP", "VBN", "VBG", "TO", "VB", "PRP", "."],  # sentence 1
        ["DT", "VBZ", "DT", "NN", "IN", "DT", "NN", "."],  # sentence 2
    ]
    second_document = [  # consists of a single sentence
        ["DT", "VBZ", "DT", "JJ", "NN", ",", "JJ", "IN", "DT", "NNS", "."],
    ]
    expected = [first_document, second_document]
    actual = StanzaNlpToFieldTransformer("xpos").transform(sentences)
    assert expected == actual
def test_stanza_nlp_to_field_transformer_upos() -> None:
    """Extracting "upos" yields one list of tags per sentence, per document."""
    first_document = [
        ["PRON", "AUX", "AUX", "VERB", "PART", "VERB", "PRON", "PUNCT"],
        ["PRON", "AUX", "DET", "NOUN", "ADP", "DET", "NOUN", "PUNCT"],
    ]
    # The second document consists of a single sentence.
    only_sentence = [
        "PRON", "AUX", "DET", "ADJ", "NOUN", "PUNCT",
        "ADJ", "ADP", "DET", "NOUN", "PUNCT",
    ]
    second_document = [only_sentence]
    expected = [first_document, second_document]
    actual = StanzaNlpToFieldTransformer("upos").transform(sentences)
    assert expected == actual
......@@ -106,11 +106,11 @@ class StanzaNlpToFieldTransformer(BaseEstimator, TransformerMixin):
"""
Flattens a stanford document in the same order as the parsed text.
Input (Document | Sentence):
each document is expected to be a StanfordNLP document or sentence.
Output: each document is returned as a single string, where each word is
represented as the field that is to be extracted, separated by spaces.
Sentences are separated by newlines.
Input document:
each document is expected to be a StanfordNLP document.
Output document: each document is returned as a list of sentences, and each sentence
is a list of tags (strings).
"""
def __init__(self, field: str):
......@@ -128,11 +128,11 @@ class StanzaNlpToFieldTransformer(BaseEstimator, TransformerMixin):
def transform(
self, x: Iterable[Union[str, Document, Sentence]], _y: Any = None
) -> List[str]:
) -> List[List[List[str]]]:
"""Transform documents."""
result = []
result: List[List[List[str]]] = []
for document in x:
document_result: List[str] = []
document_result: List[List[str]] = []
sentences: List[Sentence] = []
if isinstance(document, Sentence):
sentences = [document]
......@@ -145,8 +145,8 @@ class StanzaNlpToFieldTransformer(BaseEstimator, TransformerMixin):
if label is None:
raise ValueError(f"label is None for word: {word}")
new_sentence.append(label)
document_result.append(" ".join(new_sentence))
result.append("\n".join(document_result))
document_result.append(new_sentence)
result.append(document_result)
return result
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment