@@ -0,0 +1,295 @@
+import tensorflow as tf
+import numpy as np
+
+from nltk.corpus import stopwords
+import re
+
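+# Contraction -> expanded form mapping used during text cleaning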
+contractions = {
+    "ain't": "am not",
+    "aren't": "are not",
+    "can't": "cannot",
+    "can't've": "cannot have",
+    "'cause": "because",
+    "could've": "could have",
+    "couldn't": "could not",
+    "couldn't've": "could not have",
+    "didn't": "did not",
+    "doesn't": "does not",
+    "don't": "do not",
+    "hadn't": "had not",
+    "hadn't've": "had not have",
+    "hasn't": "has not",
+    "haven't": "have not",
+    "he'd": "he would",
+    "he'd've": "he would have",
+    "he'll": "he will",
+    "he's": "he is",
+    "how'd": "how did",
+    "how'll": "how will",
+    "how's": "how is",
+    "i'd": "i would",
+    "i'll": "i will",
+    "i'm": "i am",
+    "i've": "i have",
+    "isn't": "is not",
+    "it'd": "it would",
+    "it'll": "it will",
+    "it's": "it is",
+    "let's": "let us",
+    "ma'am": "madam",
+    "mayn't": "may not",
+    "might've": "might have",
+    "mightn't": "might not",
+    "must've": "must have",
+    "mustn't": "must not",
+    "needn't": "need not",
+    "oughtn't": "ought not",
+    "shan't": "shall not",
+    "sha'n't": "shall not",
+    "she'd": "she would",
+    "she'll": "she will",
+    "she's": "she is",
+    "should've": "should have",
+    "shouldn't": "should not",
+    "that'd": "that would",
+    "that's": "that is",
+    "there'd": "there had",
+    "there's": "there is",
+    "they'd": "they would",
+    "they'll": "they will",
+    "they're": "they are",
+    "they've": "they have",
+    "wasn't": "was not",
+    "we'd": "we would",
+    "we'll": "we will",
+    "we're": "we are",
+    "we've": "we have",
+    "weren't": "were not",
+    "what'll": "what will",
+    "what're": "what are",
+    "what's": "what is",
+    "what've": "what have",
+    "where'd": "where did",
+    "where's": "where is",
+    "who'll": "who will",
+    "who's": "who is",
+    "won't": "will not",
+    "wouldn't": "would not",
+    "you'd": "you would",
+    "you'll": "you will",
+    "you're": "you are"
+}
+
+
+# text cleaning
+def clean_text(text, remove_stopwords=False):
+
+    # Convert words to lower case
+    text = text.lower()
+
+    # Replace contractions with their longer forms
+    text = text.split()
+    new_text = []
+    for word in text:
+        if word in contractions:
+            new_text.append(contractions[word])
+        else:
+            new_text.append(word)
+    text = " ".join(new_text)
+
+    # Format words and remove unwanted characters
+    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
+    text = re.sub(r'\<a href', ' ', text)
+    text = re.sub(r'&', '', text)
+    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
+    text = re.sub(r'<br />', ' ', text)
+    text = re.sub(r'\'', ' ', text)
+
+    # Optionally, remove stop words
+    if remove_stopwords:
+        text = text.split()
+        stops = set(stopwords.words("english"))
+        text = [w for w in text if w not in stops]
+        text = " ".join(text)
+
+    return text
+
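+# Build a word -> integer id mapping; <UNK> and <PAD> take the last two ids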
+def vocab_to_int_dict(word_count):
+    vocab_to_int = {}
+    value = 0
+
+    for word in word_count:
+        vocab_to_int[word] = value
+        value += 1
+    vocab_to_int["<UNK>"] = len(vocab_to_int)
+    vocab_to_int["<PAD>"] = len(vocab_to_int)
+
+    return vocab_to_int
+
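+# Build a (vocab_size x 300) embedding matrix from GloVe, with random vectors for out-of-vocabulary words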
+def embedd_into_matrix(vocab_to_int, embeddings_index):
+    embedding_dim = 300
+    word_embedding_matrix = np.zeros((len(vocab_to_int), embedding_dim), dtype=np.float32)
+    for word, i in vocab_to_int.items():
+        if word in embeddings_index:
+            word_embedding_matrix[i] = embeddings_index[word]
+        else:
+            # If the word is not in GloVe, create a random vector for it
+            new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim), dtype=np.float32)
+            word_embedding_matrix[i] = new_embedding
+    if len(vocab_to_int) == len(word_embedding_matrix):
+        print("Embedding matrix rows match the vocabulary size.")
+
+    return word_embedding_matrix
+
+# Convert word tokens into their integer representations
+def convert_to_ints(text, vocab_to_int):
+    ints = []
+    for sentence in text:
+        sentence_ints = []
+        for word in sentence.split():
+            if word in vocab_to_int:
+                sentence_ints.append(vocab_to_int[word])
+            else:
+                sentence_ints.append(vocab_to_int["<UNK>"])
+        ints.append(sentence_ints)
+
+    return ints
+
+# Completes the text transformation: convert words into integers and pad to a fixed length
+def preprocess(clean_articles):
+    # word count
+    word_count = {}
+    for line in clean_articles:
+        for word in line.split():
+            if word not in word_count:
+                word_count[word] = 1
+            else:
+                word_count[word] += 1
+
+    vocab_to_int = vocab_to_int_dict(word_count)
+    int_articles = convert_to_ints(clean_articles, vocab_to_int)
+
+    # find the maximum sequence length
+    max_sequence_length = 0
+    for line in int_articles:
+        if len(line) > max_sequence_length:
+            max_sequence_length = len(line)
+    print("Max Sequence Length: ", max_sequence_length)
+    padded_articles = []
+    # pad all articles to the maximum length
+    for line in int_articles:
+        padding = [vocab_to_int["<PAD>"]] * (max_sequence_length - len(line))
+        padded_articles.append(line + padding)
+
+    return padded_articles, max_sequence_length, vocab_to_int
+
+# importing data
+articles = []
+with open("./train.title.txt") as f:
+    for line in f:
+        articles.append(line)
+
+articles = articles[:10000]
+
+# import the GloVe embeddings
+embeddings_index = {}
+with open("./glove.6B.300d.txt", encoding='utf-8') as f:
+    for line in f:
+        values = line.split()
+        word = values[0]
+        embedding = np.asarray(values[1:], dtype='float32')
+        embeddings_index[word] = embedding
+
+clean_articles = []
+for article in articles:
+    clean_articles.append(clean_text(article))
+
+padded_articles, max_sequence_length, vocab_to_int = preprocess(clean_articles)
+word_embedding_matrix = embedd_into_matrix(vocab_to_int, embeddings_index)
+
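+# Model hyperparameters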
+embedding_size = 300
+num_filters = 128
+batch_size = 20
+cell_size = 128
+num_features = 7
+lstm_keep_prob = 0.6
+
+# Build the graph
+# placeholders for inputs
+input_x = tf.placeholder(tf.int32, [None, max_sequence_length], name="input_x")
+input_num = tf.placeholder(tf.float32, [None, 100, num_features], name="input_num")
+keep_prob = tf.placeholder(tf.float32, name="keep_prob")
+targets = tf.placeholder(tf.float32, [batch_size, 1], name="targets")
+
+
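+# Look up word embeddings for input_x and add a channel dimension so conv2d can be applied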
+with tf.name_scope("embedding"):
|
|
|
+ embed_input = tf.nn.embedding_lookup(word_embedding_matrix, input_x)
|
|
|
+ embed_input_expanded = tf.expand_dims(embed_input, -1)
|
|
|
+
|
|
|
+# convolution and max-pooling layer for each filter size
+pooled_outputs = []
+filter_sizes = [3, 4, 5]
+
+for filter_size in filter_sizes:
+    with tf.name_scope("conv-maxpool-{}".format(filter_size)):
+        # Convolution layer
+        filter_shape = [filter_size, embedding_size, 1, num_filters]
+        W = tf.Variable(tf.truncated_normal(shape=filter_shape, stddev=0.1), name="W")
+        b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
+        conv = tf.nn.conv2d(embed_input_expanded, W, strides=[1, 1, 1, 1], padding="VALID", name="conv")
+        # ReLU
+        h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
+        # max-pooling over the full sequence
+        pooled = tf.nn.max_pool(h, ksize=[1, max_sequence_length - filter_size + 1, 1, 1], strides=[1, 1, 1, 1], padding="VALID", name="pool")
+        pooled_outputs.append(pooled)
+
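+# Each branch yields a [batch, 1, 1, num_filters] tensor after pooling over the whole sequence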
+# Combine all pooled outputs
+num_filters_total = num_filters * len(filter_sizes)
+hpool = tf.concat(pooled_outputs, 3)
+hpool_flat = tf.reshape(hpool, [-1, num_filters_total, 1])
+
+with tf.name_scope("dropout"):
+    h_drop = tf.nn.dropout(hpool_flat, keep_prob)
+
+# Recurrent layer over the pooled CNN features
+with tf.name_scope("rnn_textual") as rt:
+    cell = tf.contrib.rnn.LSTMCell(cell_size, initializer=tf.random_uniform_initializer(-0.1, 0.1))
+    cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=lstm_keep_prob)
+    initial_state = cell.zero_state(batch_size, dtype=tf.float32)
+    # feed the dropout output rather than the raw pooled features, so h_drop is actually used
+    rnn_outputs, rnn_states = tf.nn.dynamic_rnn(cell, h_drop, initial_state=initial_state, dtype=tf.float32, time_major=False, scope="rnn_textual")
+
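+# Second LSTM over the numeric feature sequence (input_num)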
+with tf.name_scope("rnn_numeric") as rn:
|
|
|
+ cell = tf.contrib.rnn.LSTMCell(cell_size, initializer=tf.random_uniform_initializer(-0.1, 0.1))
|
|
|
+ cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=lstm_keep_prob)
|
|
|
+ initial_state = cell.zero_state(batch_size, dtype=tf.float32)
|
|
|
+ rnn_outputs_num, rnn_states_num = tf.nn.dynamic_rnn(cell, input_num, initial_state=initial_state, dtype=tf.float32, scope="rnn_numeric", time_major=False)
|
|
|
+
|
|
|
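+# Concatenate the two RNN output sequences, flatten, apply dropout, and regress to a single value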
+dense_layer_input = tf.concat([rnn_outputs, rnn_outputs_num], 1)
+dense_layer_flat = tf.contrib.layers.flatten(dense_layer_input)
+dense_dropout = tf.nn.dropout(dense_layer_flat, keep_prob)
+final_output = tf.layers.dense(dense_dropout, 1, activation=tf.nn.relu, use_bias=True, kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1), trainable=True)
+print("Graph Done.")
+
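+# Mean-squared-error loss trained with plain gradient descent (learning rate 0.1)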
+loss = tf.losses.mean_squared_error(labels=targets, predictions=final_output)
+optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)
+
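+# Build mini-batches of the padded articles; the training loop itself is still commented out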
+with tf.Session() as sess:
+    sess.run(tf.global_variables_initializer())
+    batches_x = []
+    print(final_output.shape)
+    # print(rnn_outputs_num.shape)
+    for i in range(len(padded_articles) // batch_size):
+        batches_x.append(padded_articles[i * batch_size : (i + 1) * batch_size])
+    # loss, _ = sess.run([loss, optimizer], feed_dict={})
+    # for i, batch_x in enumerate(batches_x):
+    #     # result = sess.run(hpool_flat, feed_dict={input_x: batch_x, keep_prob: 0.6})
+    #     print("iteration: {} done.".format(str(i)))
+    print("All done.")
+
+    writer = tf.summary.FileWriter("./my_graph", sess.graph)
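+    # A minimal training-loop sketch (assumption: numeric-feature batches `batches_num` and
+    # target batches `batches_y` would be built the same way as `batches_x`; neither exists
+    # above, so this stays commented out):
+    # for i, batch_x in enumerate(batches_x):
+    #     batch_loss, _ = sess.run([loss, optimizer],
+    #                              feed_dict={input_x: batch_x,
+    #                                         input_num: batches_num[i],
+    #                                         targets: batches_y[i],
+    #                                         keep_prob: 0.6})
+    #     print("iteration: {}, loss: {}".format(i, batch_loss))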