forked from lbechberger/MLinPractice
-
Notifications
You must be signed in to change notification settings - Fork 2
/
standardize.py
43 lines (32 loc) · 1.41 KB
/
standardize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Preprocessor replaces UK spelling variations with their US equivalent to standardize spellings
@author: lmcdonald
"""
import re
from code.preprocessing.preprocessor import Preprocessor
from code.preprocessing.util.spellings import SPELLINGS_MAP
# substitutes UK spellings with US form
# credit: http://www.tysto.com/uk-us-spelling-list.html
class Standardizer(Preprocessor):
# constructor
def __init__(self, input_col, output):
# input column "tweet", new output column
super().__init__([input_col], output)
# get preprocessed column based on data frame and internal variables
def _get_values(self, inputs, spellings=SPELLINGS_MAP):
# appends uk spellings to RegEx string ("centre|theatre|...")
uk_spelling_pattern = re.compile('({})'.format('|'.join(spellings.keys())),
flags=re.IGNORECASE|re.DOTALL)
# replace spellings with their mapping
def standardize(uk_spelling):
match = uk_spelling.group(0)
if spellings.get(match):
us_spelling = spellings.get(match)
else:
us_spelling = spellings.get(match.lower())
return us_spelling
# change UK spelling to US spelling
column = inputs[0].str.replace(uk_spelling_pattern, standardize)
return column