Commit 37b3aaed authored by Benjamin Murauer
Browse files

allows prefixing for pathtransformer

parent 60847c1e
Pipeline #39338 passed with stage
in 2 minutes and 45 seconds
"""
These transformers can be used if filenames are used as documents.
"""
"""These transformers can be used if filenames are used as documents."""
import json
import os
......@@ -13,6 +11,7 @@ from sklearn.base import TransformerMixin
class FileReader(BaseEstimator, TransformerMixin):
"""
Transforms filenames into their content.
By default, this transformer will open files in text mode.
This is not a generator, so be careful with your memory.
......@@ -22,6 +21,8 @@ class FileReader(BaseEstimator, TransformerMixin):
def __init__(self, mode='text'):
"""
Reads files.
Args:
mode: The mode in which the files are to be opened. One of 'text',
'pickle' or 'json'.
......@@ -31,12 +32,14 @@ class FileReader(BaseEstimator, TransformerMixin):
raise ValueError(f'unknown mode {mode}, use one of: {valid_modes}')
self.mode = mode
def fit(self, X, y=None):
def fit(self, x, y=None):
"""Does nothing."""
return self
def transform(self, X):
def transform(self, x):
"""Transforms x by reading files."""
ret = []
for document in X:
for document in x:
if not os.path.isfile(document):
raise ValueError(f'No such file: {document}')
if self.mode == 'text':
......@@ -53,8 +56,7 @@ class FileReader(BaseEstimator, TransformerMixin):
class FileWriter(BaseEstimator, TransformerMixin):
"""
Stores elements to filenames, which must be provided at object creation
time.
Stores elements to filenames, which must be provided at creation.
input: list of serializable content
output: the same content. Storing the files is a side-effect.
......@@ -62,25 +64,28 @@ class FileWriter(BaseEstimator, TransformerMixin):
def __init__(self, filenames, writemode='text'):
"""
Writes files.
Args:
filenames: one valid path for each entry in x passed to transform()
writemode: mode in which to write the files. Can be 'text',
'json' or 'pickle'.
Raises:
ValueError: if writemode is not one of the valid write modes.
"""
self.filenames = filenames
valid_writemodes = ['text', 'json', 'pickle']
if writemode not in valid_writemodes:
raise ValueError(f'writemode must be one of {valid_writemodes}')
self.writemode = writemode
def fit(self, X, y=None):
def fit(self, x, y=None):
"""Does nothing."""
return self
def transform(self, X, y=None):
X = list(X)
assert len(X) == len(self.filenames)
for i, (filename, document) in enumerate(zip(self.filenames, X)):
def transform(self, x, y=None):
"""Transforms x by writing files."""
x = list(x)
assert len(x) == len(self.filenames)
for filename, document in zip(self.filenames, x):
if not os.path.isdir(os.path.dirname(filename)):
os.makedirs(os.path.dirname(filename), exist_ok=True)
if self.writemode == 'text':
......@@ -92,7 +97,7 @@ class FileWriter(BaseEstimator, TransformerMixin):
elif self.writemode == 'pickle':
with open(filename, 'wb') as o_f:
pickle.dump(document, o_f)
return X
return x
class PathTransformer(TransformerMixin, BaseEstimator):
......@@ -104,29 +109,38 @@ class PathTransformer(TransformerMixin, BaseEstimator):
the initialization parameters
"""
def __init__(self, path, extension=None):
def __init__(self, path, prefix_path=None, extension=None):
"""
Transforms paths.
Args:
path: The path which is appended to the directory of each filename.
If None or empty, nothing is appended.
prefix_path: The path which is prepended to the directory of each
filename. If None or empty, nothing is prepended.
extension: The new extension that is used, including the dot.
This replaces the last old extension only: given a file
foobar.txt.gz and an extension .grammar will result in
foobar.txt.grammar, because transform() strips the old extension
with os.path.splitext.
NOTE(review): this text previously promised foobar.grammar
(all old extensions replaced) - confirm which one is intended.
"""
self.path = path
self.prefix_path = prefix_path
self.extension = extension
def fit(self, X, y=None):
def fit(self, x, y=None):
"""Does nothing."""
return self
def transform(self, X, y=None):
def transform(self, x, y=None):
"""
Transforms paths in x.
For every entry, self.path (if set) is appended to the entry's directory
and self.prefix_path (if set) is prepended to it. If self.extension is
set, it replaces the last extension of the file name.
Args:
x: iterable of path strings.
y: unused.
Returns:
A list with one transformed path per entry in x.
"""
ret = []
for x in X:
base = os.path.dirname(x)
new_base = os.path.join(base, self.path)
name = os.path.splitext(os.path.basename(x))[0]
# NOTE(review): the loop variable reuses the name of the parameter x. This
# works because the iterable is evaluated once before x is rebound, but
# the input is no longer reachable afterwards - consider renaming.
for x in x:
directory = os.path.dirname(x)
if self.path:
directory = os.path.join(directory, self.path)
if self.prefix_path:
# NOTE(review): os.path.join discards prefix_path when directory is an
# absolute path, so the prefix only takes effect for relative entries.
directory = os.path.join(self.prefix_path, directory)
if self.extension:
ret.append(os.path.join(new_base, name + self.extension))
# splitext removes only the last suffix (a.txt.gz -> a.txt).
filename = os.path.splitext(os.path.basename(x))[0]
ret.append(os.path.join(directory, filename + self.extension))
else:
ret.append(os.path.join(new_base, os.path.basename(x)))
ret.append(os.path.join(directory, os.path.basename(x)))
return ret
"""Tests convenience transformers for file names."""
import tempfile
import unittest
......@@ -7,8 +8,10 @@ from dbispipeline.pipeline.filename import PathTransformer
class TestFileNames(unittest.TestCase):
"""Tests FileReader, Writer and PathTransformers."""
def test_text(self):
"""Should read text files."""
tmpfiles = [
tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False)
for x in range(10)
......@@ -22,6 +25,7 @@ class TestFileNames(unittest.TestCase):
self.assertEqual(contents, read_contents)
def test_json(self):
"""Should read json files."""
tmpfiles = [
tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False)
for x in range(10)
......@@ -35,6 +39,7 @@ class TestFileNames(unittest.TestCase):
self.assertEqual(contents, read_contents)
def test_pickle(self):
"""Should read pickle files."""
tmpfiles = [
tempfile.NamedTemporaryFile(mode='wb',
suffix='.pickle',
......@@ -49,6 +54,7 @@ class TestFileNames(unittest.TestCase):
self.assertEqual(contents, read_contents)
def test_path_renamer_noextension(self):
"""Should append directory to path."""
transformer = PathTransformer('grammar')
documents = ['/some/file/a.txt', '/some/file/b', '/some/file/c.blob']
expected = [
......@@ -60,6 +66,7 @@ class TestFileNames(unittest.TestCase):
self.assertEqual(actual, expected)
def test_path_renamer_extension(self):
"""Should change extension."""
transformer = PathTransformer('grammar', extension='.json')
documents = ['/some/file/a.txt', '/some/file/b', '/some/file/c.blob']
expected = [
......@@ -69,3 +76,17 @@ class TestFileNames(unittest.TestCase):
]
actual = transformer.transform(documents)
self.assertEqual(actual, expected)
def test_path_prefixed(self):
"""Should prepend directory to path."""
transformer = PathTransformer(path=None,
prefix_path='/very/fast/storage',
extension='.json')
documents = ['file/a.txt', 'file/b', 'file/c.blob']
expected = [
'/very/fast/storage/file/a.json',
'/very/fast/storage/file/b.json',
'/very/fast/storage/file/c.json',
]
actual = transformer.transform(documents)
self.assertEqual(actual, expected)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment