42 lines
1.2 KiB
Python
42 lines
1.2 KiB
Python
|
# %%
|
||
|
import random
import re
import string
|
||
|
|
||
|
def corrupt_word(word):
    """Corrupt a single word by deleting a character or swapping two neighbours.

    One of two techniques is picked uniformly at random:

    * ``"delete"`` removes one randomly chosen character.
    * ``"swap"`` exchanges two adjacent characters. Only positions whose two
      characters differ are eligible, because swapping identical neighbours
      (the "oo" in "book") would hand the word back unchanged.

    When "swap" is picked but the word has no eligible position (e.g. "aaa"),
    a character is deleted instead, so every word longer than one character
    comes back altered.

    Args:
        word: The word to corrupt.

    Returns:
        ``word`` itself when it has fewer than two characters; otherwise a
        corrupted copy that is either one character shorter or the same
        length with two adjacent characters exchanged.
    """
    if len(word) <= 1:  # Too short to corrupt meaningfully.
        return word

    corruption_type = random.choice(["delete", "swap"])

    if corruption_type == "swap":
        # Left-hand indices of adjacent pairs that actually differ.
        swappable = [
            idx
            for idx, (left, right) in enumerate(zip(word, word[1:]))
            if left != right
        ]
        if swappable:
            idx = random.choice(swappable)
            return word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:]
        # Every character is identical, so no swap can change the word;
        # fall through to deletion to guarantee a visible corruption.

    # Randomly delete a character.
    idx = random.randint(0, len(word) - 1)
    return word[:idx] + word[idx + 1:]
|
||
|
|
||
|
def corrupt_string(sentence, corruption_probability=0.01):
    """Corrupt each word of ``sentence`` independently with a given probability.

    The sentence is split on runs of whitespace. Each non-empty word is passed
    to ``corrupt_word`` when a fresh ``random.random()`` draw is below
    ``corruption_probability``; otherwise it is kept as is. The whitespace
    between and around the words (repeated spaces, tabs, newlines) is copied
    through verbatim, so a probability of 0 returns the input unchanged.

    Args:
        sentence: The text whose words may be corrupted.
        corruption_probability: Chance that any single word is corrupted.
            Values <= 0 never corrupt; values >= 1 corrupt every word.

    Returns:
        A new string with the same whitespace layout as ``sentence`` and
        some of its words corrupted.
    """
    # The capturing group makes re.split keep the separators, so the result
    # alternates word, whitespace, word, ...: even indices are words (empty
    # at the edges when the text starts or ends with whitespace).
    pieces = re.split(r"(\s+)", sentence)
    return "".join(
        corrupt_word(piece)
        # Short-circuiting means one random draw per real word only.
        if index % 2 == 0 and piece and random.random() < corruption_probability
        else piece
        for index, piece in enumerate(pieces)
    )
|
||
|
|
||
|
# Example usage: corrupt a sample sentence and print it next to the original.
sentence = "This is a simple string for testing"
corrupted_sentence = corrupt_string(sentence, corruption_probability=0.1)
for _label, _text in (("Original:", sentence), ("Corrupted:", corrupted_sentence)):
    print(_label, _text)
|
||
|
|
||
|
# %%
|