spam-detector/src/translate-dataset.py
2024-10-29 00:17:54 +03:00

75 lines
2.3 KiB
Python

# https://thepythoncode.com/article/translate-text-in-python
from os import close
from googletrans import Translator
import csv
from tqdm import tqdm
import argparse
import sys
import os.path
import shutil
# Raise the csv module's per-field size cap as far as the platform allows so
# that very long message bodies do not abort parsing.  On platforms where a C
# long is 32 bits, sys.maxsize does not fit and field_size_limit() raises
# OverflowError, so fall back to the largest 32-bit signed value.
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    csv.field_size_limit(2**31 - 1)

# Command-line interface: both paths are mandatory.  Without `required=True`
# a missing option leaves the attribute as None and the script only fails
# later with an opaque TypeError from open(None).
parser = argparse.ArgumentParser(prog='translate-dataset')
parser.add_argument('-i', '--input', required=True, help='Source file')
parser.add_argument('-o', '--output', required=True, help='Destination file')
args = parser.parse_args()
#/home/bvn13/develop/spam-detector-1/spam.csv
# Create the translator instance that the rest of the script reuses, then
# translate one fixed sample phrase into Russian and print the outcome
# (original text, detected source language, translated text, target language).
# NOTE(review): presumably a start-up connectivity check -- confirm.
translator = Translator()
translation = translator.translate("Hola Mundo", dest="ru")
message = f"{translation.origin} ({translation.src}) --> {translation.text} ({translation.dest})"
print(message)
# Count every CSV row of the input file (header included); the result is
# used as the total for the progress bar.
with open(args.input, "r") as source_file:
    total = sum(1 for _ in csv.reader(source_file))
# Resume support: when the destination file already exists, copy it to a
# "<output>.bup" backup and count how many CSV rows (header included) it
# holds.  `bup` stays None and `skip` stays 0 when there is nothing to resume.
bup = None
skip = 0
if os.path.exists(args.output):
    bup = f"{args.output}.bup"
    shutil.copyfile(args.output, bup)
    with open(args.output, "r") as previous_output:
        skip = sum(1 for _ in csv.reader(previous_output))
# Main pass: rewrite the output file as (label, text) rows with the text
# translated into Russian.
#
# * The first `skip - 1` data rows are copied verbatim from the backup of the
#   previous run instead of being translated again.
# * Every remaining input row is translated; a row that cannot be read or
#   translated is reported and left out of the output.
#
# NOTE(review): because failed rows are not written, the number of rows in
# the output can be smaller than the number of input rows processed, so a
# resumed run may re-translate a few rows near the resume point -- confirm
# whether that matters for the dataset.
#
# All csv files are opened with newline="" as the csv module requires;
# without it the writer emits "\r\r\n" row endings on Windows.
with tqdm(total=total, unit='row', unit_scale=2) as progress:
    with open(args.input, "r", newline="") as f, \
            open(args.output, "w", newline="") as tf:
        bupf = None
        bupcsv = None
        if bup is not None:
            bupf = open(bup, "r", newline="")
            bupcsv = csv.reader(bupf)
        try:
            if bupcsv is not None:
                # Drop the backup's header row; the default keeps an empty
                # backup file from raising StopIteration.
                next(bupcsv, None)

            reader = csv.reader(f)
            ru = csv.writer(tf, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            ru.writerow(['label', 'text'])

            # Skip the input header and account for it in the progress bar.
            next(reader, None)
            progress.update(1)

            # `skip` counts the old header too, hence the counter starts at 1.
            skipped = 1
            for row in reader:
                progress.update(1)
                if skipped < skip:
                    # Row was handled by a previous run: reuse its translation.
                    skipped += 1
                    ru.writerow(next(bupcsv))
                    continue
                try:
                    # Indexing lives inside the try so that a short or blank
                    # row is skipped instead of aborting the whole run.
                    decision = row[0]
                    text = row[1]
                    translated_text = translator.translate(text, dest='ru')
                    ru.writerow([decision] + [translated_text.text])
                except Exception as e:
                    print(f"Skipping line: {e}")
        except Exception as e:
            print(e)
        finally:
            # Close the file object itself: os.close() expects an integer
            # file descriptor and raises TypeError for a file object.
            if bupf is not None:
                bupf.close()