EP.py

import tensorflow as tf
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
from tensorflow.python.layers.core import Dense
import numpy as np
from nltk.corpus import stopwords
import re
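
# Note: this script targets the TensorFlow 1.x API (tf.placeholder, tf.contrib,
# tf.Session) and assumes the NLTK stopwords corpus has been downloaded
# (nltk.download('stopwords')) and that ./train.title.txt and
# ./glove.6B.300d.txt exist alongside the script.
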
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are"
}
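
# The keys above are all lowercase, so clean_text() lowercases the text before
# looking words up in this dictionary.
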
# text cleaning
def clean_text(text, remove_stopwords=False):
    # Convert words to lower case
    text = text.lower()

    # Replace contractions with their longer forms
    text = text.split()
    new_text = []
    for word in text:
        if word in contractions:
            new_text.append(contractions[word])
        else:
            new_text.append(word)
    text = " ".join(new_text)

    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Optionally, remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]
        text = " ".join(text)

    return text
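
# Illustrative example (not part of the original script): with the regexes above,
# clean_text("I'm loving it!") returns roughly "i am loving it " -- lowercased,
# contraction expanded, punctuation replaced by spaces.
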
# Build a word -> integer id lookup, with extra ids for <UNK> and <PAD>
def vocab_to_int_dict(word_count):
    vocab_to_int = {}
    value = 0
    for word, _ in word_count.items():
        vocab_to_int[word] = value
        value += 1
    vocab_to_int["<UNK>"] = len(vocab_to_int)
    vocab_to_int["<PAD>"] = len(vocab_to_int)
    return vocab_to_int
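
# Illustrative example (not part of the original script):
#   vocab_to_int_dict({"the": 3, "cat": 1})
# yields {"the": 0, "cat": 1, "<UNK>": 2, "<PAD>": 3} -- counts are ignored and
# ids follow the dict's insertion order (Python 3.7+).
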
# Build the embedding matrix: use the GloVe vector when available,
# otherwise a random vector of the same dimensionality
def embedd_into_matrix(vocab_to_int, embeddings_index):
    embedding_dim = 300
    word_embedding_matrix = np.zeros((len(vocab_to_int), embedding_dim), dtype=np.float32)
    for word, i in vocab_to_int.items():
        if word in embeddings_index:
            word_embedding_matrix[i] = embeddings_index[word]
        else:
            # If the word is not in GloVe, create a random vector for it
            new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim), dtype=np.float32)
            word_embedding_matrix[i] = new_embedding
    if len(vocab_to_int) == len(word_embedding_matrix):
        print("Embedding matrix and vocabulary sizes match.")
    return word_embedding_matrix

# Convert word tokens into their integer representations
def convert_to_ints(text, vocab_to_int):
    ints = []
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int["<UNK>"])
        ints.append(sentence_ints)
    return ints
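
# Illustrative example (not part of the original script): with
# vocab_to_int = {"the": 0, "cat": 1, "<UNK>": 2, "<PAD>": 3},
#   convert_to_ints(["the cat", "the dog"], vocab_to_int)
# returns [[0, 1], [0, 2]] -- "dog" is out of vocabulary and maps to <UNK>.
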
# Complete the text transformation: count words, build the vocabulary,
# convert words to integers, and pad every article to the same length
def preprocess(clean_articles):
    # word count
    word_count = {}
    for line in clean_articles:
        for word in line.split():
            if word not in word_count:
                word_count[word] = 1
            else:
                word_count[word] += 1
    vocab_to_int = vocab_to_int_dict(word_count)
    int_articles = convert_to_ints(clean_articles, vocab_to_int)

    # find the maximum sequence length
    max_sequence_length = 0
    for line in int_articles:
        if len(line) > max_sequence_length:
            max_sequence_length = len(line)
    print("Max Sequence Length: ", max_sequence_length)

    # pad all articles to the maximum length
    padded_articles = []
    for line in int_articles:
        padding = [vocab_to_int["<PAD>"]] * (max_sequence_length - len(line))
        padded_articles.append(line + padding)
    return padded_articles, max_sequence_length, vocab_to_int
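
# Illustrative example (not part of the original script): for
# clean_articles = ["the cat", "the cat sat down"], preprocess() builds the
# vocabulary from both lines, gets max_sequence_length = 4, and pads the first
# article with <PAD> ids, returning the padded int sequences plus vocab_to_int.
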
# importing data
articles = []
with open("./train.title.txt") as f:
    for line in f:
        articles.append(line)
articles = articles[:10000]

# load the GloVe embeddings
embeddings_index = {}
with open("./glove.6B.300d.txt", encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

clean_articles = []
for article in articles:
    clean_articles.append(clean_text(article))

padded_articles, max_sequence_length, vocab_to_int = preprocess(clean_articles)
word_embedding_matrix = embedd_into_matrix(vocab_to_int, embeddings_index)

# model hyperparameters
embedding_size = 300
num_filters = 128
batch_size = 20
cell_size = 128
num_features = 7
lstm_keep_prob = 0.6
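
# Architecture overview: the padded word ids are embedded with the GloVe-based
# matrix, run through parallel conv/max-pool branches (filter sizes 3, 4, 5),
# and the pooled features feed one LSTM; a second LSTM consumes the numeric
# feature sequence (input_num). Both RNN outputs are concatenated, flattened,
# and regressed to a single value trained with mean squared error.
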
# Build the TensorFlow graph
# placeholders for inputs
input_x = tf.placeholder(tf.int32, [None, max_sequence_length], name="input_x")
input_num = tf.placeholder(tf.float32, [None, 100, num_features], name="input_num")
keep_prob = tf.placeholder(tf.float32, name="keep_prob")
targets = tf.placeholder(tf.float32, [batch_size, 1], name="targets")

with tf.name_scope("embedding"):
    # [batch, max_sequence_length, embedding_size], plus a channel dim for conv2d
    embed_input = tf.nn.embedding_lookup(word_embedding_matrix, input_x)
    embed_input_expanded = tf.expand_dims(embed_input, -1)

# convolution and max-pooling branch for each filter size
pooled_outputs = []
filter_sizes = [3, 4, 5]
for i, filter_size in enumerate(filter_sizes):
    with tf.name_scope("conv-maxpool-{}".format(filter_size)):
        # convolution layer
        filter_shape = [filter_size, embedding_size, 1, num_filters]
        W = tf.Variable(tf.truncated_normal(shape=filter_shape, stddev=0.1), name="W")
        b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
        conv = tf.nn.conv2d(embed_input_expanded, W, strides=[1, 1, 1, 1], padding="VALID", name="conv")
        # ReLU non-linearity
        h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
        # max-pooling over the remaining sequence positions
        pooled = tf.nn.max_pool(h, ksize=[1, max_sequence_length - filter_size + 1, 1, 1],
                                strides=[1, 1, 1, 1], padding="VALID", name="pool")
        pooled_outputs.append(pooled)

# Combine all pooled outputs into [batch, num_filters_total, 1]
num_filters_total = num_filters * len(filter_sizes)
hpool = tf.concat(pooled_outputs, 3)
hpool_flat = tf.reshape(hpool, [-1, num_filters_total, 1])

with tf.name_scope("dropout"):
    h_drop = tf.nn.dropout(hpool_flat, keep_prob)
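
# Shape note: hpool_flat (and h_drop) is [batch, num_filters_total, 1], so the
# LSTM below sees a sequence of num_filters_total steps with one feature each.
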
# Recurrent layer over the dropped-out CNN features
with tf.name_scope("rnn_textual") as rt:
    cell = tf.contrib.rnn.LSTMCell(cell_size, initializer=tf.random_uniform_initializer(-0.1, 0.1))
    cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=lstm_keep_prob)
    initial_state = cell.zero_state(batch_size, dtype=tf.float32)
    # feed the dropout output (h_drop) rather than the raw pooled features
    rnn_outputs, rnn_states = tf.nn.dynamic_rnn(cell, h_drop, initial_state=initial_state,
                                                dtype=tf.float32, scope="rnn_textual", time_major=False)

# Recurrent layer over the numeric feature sequence
with tf.name_scope("rnn_numeric") as rn:
    cell = tf.contrib.rnn.LSTMCell(cell_size, initializer=tf.random_uniform_initializer(-0.1, 0.1))
    cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=lstm_keep_prob)
    initial_state = cell.zero_state(batch_size, dtype=tf.float32)
    rnn_outputs_num, rnn_states_num = tf.nn.dynamic_rnn(cell, input_num, initial_state=initial_state,
                                                        dtype=tf.float32, scope="rnn_numeric",
                                                        time_major=False)

# Concatenate both RNN outputs along the time axis and regress a single value
dense_layer_input = tf.concat([rnn_outputs, rnn_outputs_num], 1)
dense_layer_flat = tf.contrib.layers.flatten(dense_layer_input)
dense_dropout = tf.nn.dropout(dense_layer_flat, keep_prob)
final_output = tf.layers.dense(dense_dropout, 1, activation=tf.nn.relu, use_bias=True,
                               kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1),
                               trainable=True)
print("Graph Done.")

loss = tf.losses.mean_squared_error(labels=targets, predictions=final_output)
optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # split the padded articles into batches of size batch_size
    batches_x = []
    print(final_output.shape)
    # print(rnn_outputs_num.shape)
    for i in range(int(len(padded_articles) / batch_size)):
        batches_x.append(padded_articles[i * batch_size:(i + 1) * batch_size])
    # batch_loss, _ = sess.run([loss, optimizer], feed_dict={})
    # for i, batch_x in enumerate(batches_x):
    #     result = sess.run(hpool_flat, feed_dict={input_x: batch_x, keep_prob: 0.6})
    #     print("iteration: {} done.".format(i))
    print("All done.")
    writer = tf.summary.FileWriter("./my_graph", sess.graph)
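
    # A minimal training-loop sketch, commented out because this script never
    # constructs the numeric-feature or target data. num_feature_batches and
    # target_batches below are hypothetical lists of arrays shaped
    # [batch_size, 100, num_features] and [batch_size, 1] respectively.
    #
    # for epoch in range(10):
    #     for i, batch_x in enumerate(batches_x):
    #         batch_loss, _ = sess.run(
    #             [loss, optimizer],
    #             feed_dict={input_x: batch_x,
    #                        input_num: num_feature_batches[i],
    #                        targets: target_batches[i],
    #                        keep_prob: 0.6})
    #         print("epoch {}, batch {}: loss {:.4f}".format(epoch, i, batch_loss))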