forked from lbechberger/MLinPractice
-
Notifications
You must be signed in to change notification settings - Fork 2
/
extract_features.py
140 lines (124 loc) · 6.79 KB
/
extract_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Runs the specified collection of feature extractors.
@author: lbechberger
"""
import argparse, csv, pickle
import pandas as pd
import numpy as np
from code.feature_extraction.onehot_time_extraction import OneHotTimeExtractor
from code.feature_extraction.character_length import CharacterLength
from code.feature_extraction.get_count_boolean import AttributeBooleanCountGetter
from code.feature_extraction.tf_idf import TfIdf
from code.feature_extraction.threads import Threads
from code.feature_extraction.feature_collector import FeatureCollector
from code.feature_extraction.sentiment import Sentiment
from code.feature_extraction.ner import NER
from code.util import COLUMN_HASHTAGS, COLUMN_MENTIONS, COLUMN_PHOTOS, COLUMN_REPLY_TO, COLUMN_RETWEET_BOOL, COLUMN_TWEET, COLUMN_LABEL, COLUMN_URLS, COLUMN_VIDEO, COLUMN_UNTOKENIZED_TWEET, COLUMN_DATE, COLUMN_PREPROCESSED_TWEET, COLUMN_TIME
# ---------------------------------------------------------------------------
# Command-line interface
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description = "Feature Extraction")

# positional arguments
parser.add_argument("input_file", help = "path to the input csv file")
parser.add_argument("output_file", help = "path to the output pickle file")

# pipeline import/export options
parser.add_argument("-e", "--export_file", help = "create a pipeline and export to the given location", default = None)
parser.add_argument("-i", "--import_file", help = "import an existing pipeline from the given location", default = None)

# boolean feature switches: (flags, help text) pairs, all registered as
# store_true options in one loop to keep the CLI definition in one place
_FEATURE_FLAGS = [
    (("-b", "--month"), "compute the month the tweet was posted"),
    (("-c", "--char_length"), "compute the number of characters in the tweet"),
    (("-d", "--daytime"), "compute the time of day the tweet was posted"),
    (("-n", "--ner"), "Apply Named-Entity-Recognition on the tweet"),
    (("-p", "--photos_count"), "compute the number of photos in the tweet"),
    (("-r", "--retweet_binary"), "compute the binary of if the tweet is a retweet"),
    (("-t", "--tfidf"), "compute word-wise tf-idf"),
    (("-u", "--url_count"), "compute the number of URLs used in the tweet"),
    (("-v", "--video_binary"), "compute the binary of if the tweet is a video"),
    (("-w", "--weekday"), "compute the day of the week the tweet was posted"),
    (("--hashtag_count",), "compute the number of hashtags in the tweet"),
    (("--item_count",), "compute the absolute count of items, else compute boolean if items > 0"),
    (("--mentions_count",), "compute the number of mentions in the tweet"),
    (("--reply_to_count",), "compute the number of accounts replied to in the tweet"),
    (("--season",), "compute the season the tweet was posted"),
    (("--sentiment",), "compute the tweet sentiment"),
    (("--threads",), "match tweets that are part of a thread"),
]
for _flags, _help_text in _FEATURE_FLAGS:
    parser.add_argument(*_flags, action = "store_true", help = _help_text)

args = parser.parse_args()
# load the input data set (one row per tweet)
df = pd.read_csv(args.input_file, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n")

# count-style features either report an absolute count ("count") or just
# whether any items are present ("boolean"), selected via --item_count
count_type = "count" if args.item_count else "boolean"
if args.import_file is not None:
    # reuse an existing (already fitted) FeatureCollector from disk
    with open(args.import_file, "rb") as f_in:
        feature_collector = pickle.load(f_in)
else:
    # need to build the FeatureCollector from the selected CLI switches.
    # Each entry pairs a flag with a zero-argument factory; the factory is
    # only invoked when the flag is set (keeps construction lazy, exactly
    # like the original if-chain), and list order fixes the order of the
    # resulting feature columns.
    candidates = [
        # character length of original tweet (without any changes)
        (args.char_length, lambda: CharacterLength(COLUMN_TWEET)),
        # recognized named entities
        (args.ner, lambda: NER(COLUMN_PREPROCESSED_TWEET)),
        # number of (or whether any) hashtags used
        (args.hashtag_count, lambda: AttributeBooleanCountGetter(COLUMN_HASHTAGS, count_type)),
        # number of (or whether any) mentions used
        (args.mentions_count, lambda: AttributeBooleanCountGetter(COLUMN_MENTIONS, count_type)),
        # number of (or whether any) accounts replied to
        (args.reply_to_count, lambda: AttributeBooleanCountGetter(COLUMN_REPLY_TO, count_type)),
        # number of (or whether any) photos used
        (args.photos_count, lambda: AttributeBooleanCountGetter(COLUMN_PHOTOS, count_type)),
        # number of (or whether any) URLs used
        (args.url_count, lambda: AttributeBooleanCountGetter(COLUMN_URLS, count_type)),
        # whether the tweet contains a video (always boolean)
        (args.video_binary, lambda: AttributeBooleanCountGetter(COLUMN_VIDEO, "boolean")),
        # whether the tweet is a retweet (always boolean)
        (args.retweet_binary, lambda: AttributeBooleanCountGetter(COLUMN_RETWEET_BOOL, "boolean")),
        # one-hot encoded weekday of the post
        (args.weekday, lambda: OneHotTimeExtractor(COLUMN_DATE, "weekday")),
        # one-hot encoded month of the post
        (args.month, lambda: OneHotTimeExtractor(COLUMN_DATE, "month")),
        # one-hot encoded season of the post (winter, spring, summer, fall)
        (args.season, lambda: OneHotTimeExtractor(COLUMN_DATE, "season")),
        # one-hot encoded daytime (0-6, 6-12, 12-18, 18-24) of the post
        (args.daytime, lambda: OneHotTimeExtractor(COLUMN_TIME, "daytime")),
        # word-wise tf-idf on the preprocessed tweet
        (args.tfidf, lambda: TfIdf(COLUMN_PREPROCESSED_TWEET)),
        # sentiment of the original tweet (without any changes)
        (args.sentiment, lambda: Sentiment(COLUMN_UNTOKENIZED_TWEET)),
        # whether the tweet is part of a thread
        (args.threads, lambda: Threads(COLUMN_TWEET)),
    ]
    features = [factory() for enabled, factory in candidates if enabled]

    # create overall FeatureCollector and fit it on the given data set
    # (assumed to be training data)
    feature_collector = FeatureCollector(features)
    feature_collector.fit(df)
# apply the given FeatureCollector on the current data set:
# maps the pandas DataFrame to a numpy feature matrix
feature_array = feature_collector.transform(df)

# labels as a column vector of shape (n_samples, 1)
label_array = np.array(df[COLUMN_LABEL]).reshape(-1, 1)

# bundle features, labels and feature names and pickle them to the output file
results = {
    "features": feature_array,
    "labels": label_array,
    "feature_names": feature_collector.get_feature_names(),
}
with open(args.output_file, 'wb') as f_out:
    pickle.dump(results, f_out)

# export the (fitted) FeatureCollector as pickle file if desired by user
if args.export_file is not None:
    with open(args.export_file, 'wb') as f_out:
        pickle.dump(feature_collector, f_out)