#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Runs the specified collection of preprocessing steps
Created on Tue Sep 28 16:43:18 2021
@author: lbechberger
"""
import argparse, csv, pickle
import pandas as pd
from sklearn.pipeline import make_pipeline
from code.preprocessing.punctuation_remover import PunctuationRemover
from code.preprocessing.tokenizer import Tokenizer
from code.preprocessing.lowercase import Lowercase
from code.preprocessing.standardize import Standardizer
from code.preprocessing.expand import Expander
from code.preprocessing.prune_languages import LanguagePruner
from code.preprocessing.regex_replacer import RegexReplacer
from code.preprocessing.lemmatizer import Lemmatizer
from code.preprocessing.stopword_remover import StopwordRemover
from code.util import (SUFFIX_PUNCTUATION, SUFFIX_STANDARDIZED, SUFFIX_TOKENIZED, SUFFIX_LOWERCASED,
                       SUFFIX_URLS_REMOVED, SUFFIX_NUMBERS_REPLACED, TOKEN_NUMBER, SUFFIX_CONTRACTIONS,
                       SUFFIX_LEMMATIZED, SUFFIX_REMOVED_STOPWORDS)
# setting up CLI
parser = argparse.ArgumentParser(description = "Various preprocessing steps")
# mandatory
parser.add_argument("input_file", help = "path to the input csv file")
parser.add_argument("output_file", help = "path to the output csv file")
# optional
parser.add_argument("-e", "--export_file", help = "create a pipeline and export to the given location", default = None)
parser.add_argument("--fast", type = int, help = "only run preprocessing on a subset of the data set. Specify subset size in int, e.g. --fast 100")
# preprocessors
parser.add_argument("-l", "--prune_lang", action="store_true")
parser.add_argument("--pipeline", action='append', nargs='*', help="define a preprocessing pipeline e.g. --pipeline "
"<column> preprocessor1 preprocessor 2 ... "
"IMPORTANT: remove_urls has to run before punctuation"
"Available preprocessors in the correct order of application: "
"remove_urls, lowercase, expand, punctuation, standardize, "
"tokenize, numbers, lemmatize, remove_stopwords")
args = parser.parse_args()
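# Example invocation (file paths, the module path, and the column name "tweet"
# are illustrative, not prescribed by this script):
#   python -m code.preprocessing.run_preprocessing data/raw.csv data/preprocessed.csv \
#       --prune_lang --fast 100 -e data/pipeline.pickle \
#       --pipeline tweet remove_urls lowercase tokenize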
# load data
df = pd.read_csv(args.input_file, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n")
if args.fast:
    df = df.drop(labels = range(args.fast, df.shape[0]), axis = 0)
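# note: the label-based drop above assumes the default RangeIndex that read_csv
# produces here; the effect is equivalent to df.head(args.fast)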
# remove all rows whose language differs from the one we want to keep (English)
if args.prune_lang:
    language_pruner = LanguagePruner(df)
    language_pruner.get_language_counts()
    df = language_pruner.drop_rows_by_language(language = "en")
# collect all preprocessors
preprocessors = []
if args.pipeline:
    for pipeline in args.pipeline:
        current_column = ''
        for preprocessor in pipeline:
            if preprocessor == 'remove_urls':
                preprocessors.append(RegexReplacer(current_column, current_column + SUFFIX_URLS_REMOVED, r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', ""))
                current_column = current_column + SUFFIX_URLS_REMOVED
            elif preprocessor == 'lowercase':
                preprocessors.append(Lowercase(current_column, current_column + SUFFIX_LOWERCASED))
                current_column = current_column + SUFFIX_LOWERCASED
            elif preprocessor == 'expand':
                preprocessors.append(Expander(current_column, current_column + SUFFIX_CONTRACTIONS))
                current_column = current_column + SUFFIX_CONTRACTIONS
            elif preprocessor == 'punctuation':
                preprocessors.append(PunctuationRemover(current_column, current_column + SUFFIX_PUNCTUATION))
                current_column = current_column + SUFFIX_PUNCTUATION
            elif preprocessor == 'standardize':
                preprocessors.append(Standardizer(current_column, current_column + SUFFIX_STANDARDIZED))
                current_column = current_column + SUFFIX_STANDARDIZED
            elif preprocessor == 'tokenize':
                preprocessors.append(Tokenizer(current_column, current_column + SUFFIX_TOKENIZED))
                current_column = current_column + SUFFIX_TOKENIZED
            elif preprocessor == 'numbers':
                preprocessors.append(RegexReplacer(current_column, current_column + SUFFIX_NUMBERS_REPLACED, r'(?<=\W)\d+(?=\W)|^\d+(?=\W)|(?<=\W)\d+$', TOKEN_NUMBER))
                current_column = current_column + SUFFIX_NUMBERS_REPLACED
            elif preprocessor == 'lemmatize':
                preprocessors.append(Lemmatizer(current_column, current_column + SUFFIX_LEMMATIZED))
                current_column = current_column + SUFFIX_LEMMATIZED
            elif preprocessor == 'remove_stopwords':
                preprocessors.append(StopwordRemover(current_column, current_column + SUFFIX_REMOVED_STOPWORDS))
                current_column = current_column + SUFFIX_REMOVED_STOPWORDS
            else:
                # any token that is not a known preprocessor names the column to
                # start from; by convention this is the first entry of a pipeline
                current_column = preprocessor
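# Each step reads from current_column and writes to a new column with an extra
# suffix, so steps chain by column name. For example (assuming a source column
# named "tweet" and, hypothetically, SUFFIX_URLS_REMOVED = "_urls_removed" and
# SUFFIX_LOWERCASED = "_lowercased"), the call
#   --pipeline tweet remove_urls lowercase
# produces the columns "tweet_urls_removed" and "tweet_urls_removed_lowercased".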
# apply all preprocessing steps
for preprocessor in preprocessors:
    df = preprocessor.fit_transform(df)
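# (each preprocessor is assumed to return the DataFrame with its output column
# added and the input column left in place, as the suffix chaining above implies)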
# store the results
# note: pandas renamed to_csv's "line_terminator" to "lineterminator" in 1.5;
# use the spelling that matches your installed pandas version
df.to_csv(args.output_file, index = False, quoting = csv.QUOTE_NONNUMERIC, line_terminator = "\n")
# create a pipeline if necessary and store it as a pickle file
if args.export_file is not None:
    pipeline = make_pipeline(*preprocessors)
    with open(args.export_file, 'wb') as f_out:
        pickle.dump(pipeline, f_out)
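# The exported pipeline can be re-applied to new data later, e.g. (illustrative
# sketch; the path is hypothetical, and it assumes the preprocessors follow the
# sklearn transformer interface, which make_pipeline requires anyway):
#   with open("data/pipeline.pickle", "rb") as f_in:
#       pipeline = pickle.load(f_in)
#   df_new = pipeline.transform(df_new)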