Source code for faker_file.providers.augment_file_from_dir

import os
from pathlib import Path
from random import choice
from typing import (
    Any,
    Callable,
    Dict,
    Iterable,
    Optional,
    Type,
    Union,
    overload,
)

from faker.providers import BaseProvider

from ...base import BytesValue, FileMixin, StringValue
from ...helpers import load_class_from_path
from ...storages.base import BaseStorage
from ...storages.filesystem import FileSystemStorage
from ..base.text_augmenter import BaseTextAugmenter
from ..base.text_extractor import BaseTextExtractor
from ..helpers.inner import (
    create_inner_docx_file,
    create_inner_eml_file,
    create_inner_epub_file,
    create_inner_odt_file,
    create_inner_pdf_file,
    create_inner_rtf_file,
    create_inner_txt_file,
)

# from .augmenters.nlpaug_augmenter import ContextualWordEmbeddingsAugmenter
# from .extractors.tika_extractor import TikaTextExtractor

# Full list:
# create_inner_bin_file: N/A
# create_inner_bmp_file: Not supported
# create_inner_csv_file: Not supported
# create_inner_docx_file: Supported
# create_inner_eml_file: Supported
# create_inner_epub_file: Supported
# create_inner_generic_file: Not supported
# create_inner_gif_file: Not supported
# create_inner_ico_file: Not supported
# create_inner_jpeg_file: Not supported
# create_inner_json_file: Not supported
# create_inner_mp3_file: Not supported
# create_inner_ods_file: Not supported
# create_inner_odt_file: Supported
# create_inner_pdf_file: Supported
# create_inner_png_file: Not supported
# create_inner_pptx_file: Not supported
# create_inner_rtf_file: Supported
# create_inner_svg_file: Not supported
# create_inner_tar_file: Not supported
# create_inner_txt_file: Supported
# create_inner_webp_file: Not supported
# create_inner_xlsx_file: Not supported
# create_inner_xml_file: Not supported
# create_inner_zip_file: Not supported


__author__ = "Artur Barseghyan <artur.barseghyan@gmail.com>"
__copyright__ = "2022-2023 Artur Barseghyan"
__license__ = "MIT"
__all__ = (
    "AugmentFileFromDirProvider",
    "NLPAUG_AUGMENTER",
    "TEXTAUGMENT_AUGMENTER",
    "DEFAULT_AUGMENTER",
    "TIKA_EXTRACTOR",
    "DEFAULT_EXTRACTOR",
    "FILE_TYPE_TO_INNER_FUNC_MAPPING",
    "EXTENSIONS",
)

NLPAUG_AUGMENTER = (
    "faker_file.providers.augment_file_from_dir.augmenters.nlpaug_augmenter"
    ".ContextualWordEmbeddingsAugmenter"
)
TEXTAUGMENT_AUGMENTER = (
    "faker_file.providers.augment_file_from_dir.augmenters"
    ".textaugment_augmenter.EDATextaugmentAugmenter"
)
DEFAULT_AUGMENTER = TEXTAUGMENT_AUGMENTER

TIKA_EXTRACTOR = (
    "faker_file.providers.augment_file_from_dir.extractors.tika_extractor"
    ".TikaTextExtractor"
)
DEFAULT_EXTRACTOR = TIKA_EXTRACTOR

FILE_TYPE_TO_INNER_FUNC_MAPPING: Dict[str, Callable] = {
    "docx": create_inner_docx_file,
    "eml": create_inner_eml_file,
    "epub": create_inner_epub_file,
    "odt": create_inner_odt_file,
    "pdf": create_inner_pdf_file,
    "rtf": create_inner_rtf_file,
    "txt": create_inner_txt_file,
}

EXTENSIONS = set(FILE_TYPE_TO_INNER_FUNC_MAPPING.keys())


[docs]class AugmentFileFromDirProvider(BaseProvider, FileMixin): """Augment file from given directory provider. Usage example: .. code-block:: python from faker import Faker from faker_file.providers.augment_file_from_dir import ( AugmentFileFromDirProvider, ) FAKER = Faker() FAKER.add_provider(AugmentFileFromDirProvider) file = FAKER.augment_file_from_dir( source_dir_path="/tmp/tmp/", ) Usage example with options: .. code-block:: python file = FAKER.augment_file_from_dir( source_dir_path="/tmp/tmp/", prefix="zzz", extensions={"docx", "pdf"} ) """ extension: str = "" @overload def augment_file_from_dir( self: "AugmentFileFromDirProvider", source_dir_path: str, extensions: Optional[Iterable[str]] = None, storage: Optional[BaseStorage] = None, basename: Optional[str] = None, prefix: Optional[str] = None, wrap_chars_after: Optional[int] = None, text_extractor_cls: Optional[ Union[str, Type[BaseTextExtractor]] ] = DEFAULT_EXTRACTOR, text_extractor_kwargs: Optional[Dict[str, Any]] = None, text_augmenter_cls: Optional[ Union[str, Type[BaseTextAugmenter]] ] = DEFAULT_AUGMENTER, text_augmenter_kwargs: Optional[Dict[str, Any]] = None, raw: bool = True, **kwargs, ) -> BytesValue: ... @overload def augment_file_from_dir( self: "AugmentFileFromDirProvider", source_dir_path: str, extensions: Optional[Iterable[str]] = None, storage: Optional[BaseStorage] = None, basename: Optional[str] = None, prefix: Optional[str] = None, wrap_chars_after: Optional[int] = None, text_extractor_cls: Optional[ Union[str, Type[BaseTextExtractor]] ] = DEFAULT_EXTRACTOR, text_extractor_kwargs: Optional[Dict[str, Any]] = None, text_augmenter_cls: Optional[ Union[str, Type[BaseTextAugmenter]] ] = DEFAULT_AUGMENTER, text_augmenter_kwargs: Optional[Dict[str, Any]] = None, **kwargs, ) -> StringValue: ...
[docs] def augment_file_from_dir( self: "AugmentFileFromDirProvider", source_dir_path: str, extensions: Optional[Iterable[str]] = None, storage: Optional[BaseStorage] = None, basename: Optional[str] = None, prefix: Optional[str] = None, wrap_chars_after: Optional[int] = None, text_extractor_cls: Optional[ Union[str, Type[BaseTextExtractor]] ] = DEFAULT_EXTRACTOR, text_extractor_kwargs: Optional[Dict[str, Any]] = None, text_augmenter_cls: Optional[ Union[str, Type[BaseTextAugmenter]] ] = DEFAULT_AUGMENTER, text_augmenter_kwargs: Optional[Dict[str, Any]] = None, raw: bool = False, **kwargs, ) -> Union[BytesValue, StringValue]: """Augment a random file from given directory. :param source_dir_path: Source files directory. :param extensions: Allowed extensions. :param storage: Storage. Defaults to `FileSystemStorage`. :param basename: File basename (without extension). :param prefix: File name prefix. :param wrap_chars_after: If given, the output string would be separated by line breaks after the given position. :param text_extractor_cls: Text extractor class. :param text_extractor_kwargs: Text extractor kwargs. :param text_augmenter_cls: Text augmenter class. :param text_augmenter_kwargs: Text augmenter kwargs. :param raw: If set to True, return `BytesValue` (binary content of the file). Otherwise, return `StringValue` (path to the saved file). :return: Relative path (from root directory) of the generated file or raw content of the file. """ # Generic if storage is None: storage = FileSystemStorage() if extensions is None: extensions = EXTENSIONS # Specific source_file_choices = [ os.path.join(source_dir_path, _f) for _f in os.listdir(source_dir_path) if ( os.path.isfile(os.path.join(source_dir_path, _f)) and os.path.splitext(_f)[1][1:] in extensions ) ] source_file_path = choice(source_file_choices) source_file = Path(source_file_path) if text_extractor_cls is None: text_extractor_cls = DEFAULT_EXTRACTOR if isinstance(text_extractor_cls, str): text_extractor_cls = load_class_from_path(text_extractor_cls) if not text_extractor_kwargs: text_extractor_kwargs = {} text_extractor = text_extractor_cls(**text_extractor_kwargs) extracted_content = text_extractor.extract(source_file) file_type = source_file.suffix[1:] if text_augmenter_cls is None: text_augmenter_cls = DEFAULT_AUGMENTER if isinstance(text_augmenter_cls, str): text_augmenter_cls = load_class_from_path(text_augmenter_cls) if not text_augmenter_kwargs: text_augmenter_kwargs = {} text_augmenter = text_augmenter_cls(**text_augmenter_kwargs) content = text_augmenter.augment(extracted_content) return FILE_TYPE_TO_INNER_FUNC_MAPPING[file_type]( storage=storage, basename=basename, prefix=prefix, content=content, wrap_chars_after=wrap_chars_after, raw=raw, )