Source code for faker_file.providers.augment_file_from_dir.augmenters.faker_augmenter

import random
from typing import List, Optional

from faker import Faker

from ...base.text_augmenter import BaseTextAugmenter

__author__ = "Artur Barseghyan <artur.barseghyan@gmail.com>"
__copyright__ = "2022-2025 Artur Barseghyan"
__license__ = "MIT"
__all__ = (
    "FakerWordAugmenter",
    "DEFAULT_AUGMENTATION_PROBABILITY",
    "DEFAULT_POOL_SIZE",
)

DEFAULT_AUGMENTATION_PROBABILITY = 0.2
DEFAULT_POOL_SIZE = 1000


class FakerWordAugmenter(BaseTextAugmenter):
    """Text augmenter that randomly replaces words with Faker-generated words.

    Usage example:

    .. code-block:: python

        from faker import Faker
        from faker_file.providers.augment_file_from_dir import (
            AugmentFileFromDirProvider,
        )
        from faker_file.providers.augment_file_from_dir.augmenters import (
            faker_augmenter,
        )

        FAKER = Faker()
        FAKER.add_provider(AugmentFileFromDirProvider)

        file = FAKER.augment_file_from_dir(
            source_dir_path="/tmp/tmp/",
            text_augmenter_cls=faker_augmenter.FakerWordAugmenter,
            text_augmenter_kwargs={
                "augmentation_probability": 0.3,
                "pool_size": 500,
                "locale": "en_US",
            }
        )

    You may also pass an existing ``Faker`` instance via ``generator``,
    which is useful when you want the augmenter to share the same Faker
    instance as the rest of your application:

    .. code-block:: python

        from faker import Faker
        from faker_file.providers.augment_file_from_dir import (
            AugmentFileFromDirProvider,
        )
        from faker_file.providers.augment_file_from_dir.augmenters import (
            faker_augmenter,
        )

        MY_FAKER = Faker("de_DE")

        FAKER = Faker()
        FAKER.add_provider(AugmentFileFromDirProvider)

        file = FAKER.augment_file_from_dir(
            source_dir_path="/tmp/tmp/",
            text_augmenter_cls=faker_augmenter.FakerWordAugmenter,
            text_augmenter_kwargs={
                "generator": MY_FAKER,
                "augmentation_probability": 0.3,
                "pool_size": 500,
            }
        )

    Options:

    - ``augmentation_probability`` (float, default 0.2): probability that
      any given word will be replaced with a randomly chosen Faker word.
    - ``pool_size`` (int, default 1000): number of words pre-generated from
      Faker to draw replacements from. A larger pool increases variety; a
      smaller pool improves startup time. A value of ``0`` or less yields
      an empty pool, in which case no word is ever replaced.
    - ``locale`` (str, default ``None``): Faker locale used when building
      the word pool (e.g. ``"en_US"``, ``"de_DE"``). When ``None``, the
      default Faker locale is used. Ignored when ``generator`` is given.
    - ``generator`` (``Faker``, default ``None``): an existing Faker
      instance to use for building the word pool. When provided, a new
      Faker instance is not created and ``locale`` is ignored.
    """

    augmentation_probability: float = DEFAULT_AUGMENTATION_PROBABILITY
    pool_size: int = DEFAULT_POOL_SIZE
    locale: Optional[str] = None
    generator: Optional[Faker] = None
    # Class-level placeholder only: the pool is always *rebound* to a fresh
    # list in ``_build_word_pool`` and never mutated in place, so instances
    # do not share state through this default.
    _word_pool: List[str] = []

    def handle_kwargs(self: "FakerWordAugmenter", **kwargs) -> None:
        """Apply the recognised options from ``kwargs`` and build the pool.

        Only ``augmentation_probability``, ``pool_size``, ``locale`` and
        ``generator`` are read; any other key is ignored. Options absent
        from ``kwargs`` keep their current value.

        :param kwargs: Augmenter options (see the class docstring).
        """
        for option in (
            "augmentation_probability",
            "pool_size",
            "locale",
            "generator",
        ):
            if option in kwargs:
                setattr(self, option, kwargs[option])
        # Rebuild unconditionally so the pool reflects the final settings.
        self._build_word_pool()

    def _build_word_pool(self: "FakerWordAugmenter") -> None:
        """Pre-generate a pool of ``pool_size`` words using Faker.

        An explicitly supplied ``generator`` takes precedence over
        ``locale``; without either, a default ``Faker()`` is created.
        """
        if self.generator is not None:
            fake = self.generator
        elif self.locale:
            fake = Faker(self.locale)
        else:
            fake = Faker()
        self._word_pool = [fake.word() for _ in range(self.pool_size)]

    def augment(self: "FakerWordAugmenter", text: str) -> str:
        """Augment text by randomly replacing words with Faker-generated ones.

        The text is split on whitespace and re-joined with single spaces,
        so runs of whitespace (including newlines) are collapsed in the
        result.

        :param text: Text to augment.
        :return: Augmented text. Empty or ``None`` input is returned as is.
        """
        if not text:
            return text

        # Build lazily in case ``handle_kwargs`` was never called.
        if not self._word_pool:
            self._build_word_pool()

        words = text.split()
        pool = self._word_pool
        if not pool:
            # ``pool_size <= 0`` leaves nothing to draw from; calling
            # ``random.choice`` on an empty sequence would raise
            # ``IndexError``, so return the words unreplaced instead.
            return " ".join(words)

        probability = self.augmentation_probability
        return " ".join(
            random.choice(pool) if random.random() < probability else word
            for word in words
        )