import tensorflow as tf
import numpy as np
from nltk.corpus import stopwords
import re

# Mapping of English contractions to their expanded forms
contractions = {
    "ain't": "am not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have",
    "'cause": "because", "could've": "could have", "couldn't": "could not",
    "couldn't've": "could not have", "didn't": "did not", "doesn't": "does not",
    "don't": "do not", "hadn't": "had not", "hadn't've": "had not have",
    "hasn't": "has not", "haven't": "have not", "he'd": "he would",
    "he'd've": "he would have", "he'll": "he will", "he's": "he is",
    "how'd": "how did", "how'll": "how will", "how's": "how is",
    "i'd": "i would", "i'll": "i will", "i'm": "i am", "i've": "i have",
    "isn't": "is not", "it'd": "it would", "it'll": "it will", "it's": "it is",
    "let's": "let us", "ma'am": "madam", "mayn't": "may not",
    "might've": "might have", "mightn't": "might not", "must've": "must have",
    "mustn't": "must not", "needn't": "need not", "oughtn't": "ought not",
    "shan't": "shall not", "sha'n't": "shall not", "she'd": "she would",
    "she'll": "she will", "she's": "she is", "should've": "should have",
    "shouldn't": "should not", "that'd": "that would", "that's": "that is",
    "there'd": "there had", "there's": "there is", "they'd": "they would",
    "they'll": "they will", "they're": "they are", "they've": "they have",
    "wasn't": "was not", "we'd": "we would", "we'll": "we will",
    "we're": "we are", "we've": "we have", "weren't": "were not",
    "what'll": "what will", "what're": "what are", "what's": "what is",
    "what've": "what have", "where'd": "where did", "where's": "where is",
    "who'll": "who will", "who's": "who is", "won't": "will not",
    "wouldn't": "would not", "you'd": "you would", "you'll": "you will",
    "you're": "you are"
}


# Text cleaning
def clean_text(text, remove_stopwords=False):
    # Convert words to lower case
    text = text.lower()

    # Replace contractions with their longer forms
    text = text.split()
    new_text = []
    for word in text:
        if word in contractions:
            new_text.append(contractions[word])
        else:
            new_text.append(word)
    text = " ".join(new_text)

    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Optionally, remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]
        text = " ".join(text)

    return text


# Build a word -> integer lookup table, plus special tokens
def vocab_to_int_dict(word_count):
    vocab_to_int = {}
    value = 0
    for word, _ in word_count.items():
        vocab_to_int[word] = value
        value += 1
    vocab_to_int["<UNK>"] = len(vocab_to_int)  # token for out-of-vocabulary words
    vocab_to_int["<PAD>"] = len(vocab_to_int)  # token used for padding
    return vocab_to_int


# Build the embedding matrix that backs the embedding lookup
def embedd_into_matrix(vocab_to_int, embeddings_index):
    embedding_dim = 300
    word_embedding_matrix = np.zeros((len(vocab_to_int), embedding_dim), dtype=np.float32)
    for word, i in vocab_to_int.items():
        if word in embeddings_index:
            word_embedding_matrix[i] = embeddings_index[word]
        else:
            # If the word is not in GloVe, create a random vector for it
            new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim), dtype=np.float32)
            word_embedding_matrix[i] = new_embedding
    if len(vocab_to_int) == len(word_embedding_matrix):
        print("Embedding matrix and vocabulary sizes match.")
    return word_embedding_matrix


# Convert word tokens into their integer representations
def convert_to_ints(text, vocab_to_int):
    ints = []
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int["<UNK>"])
        ints.append(sentence_ints)
    return ints
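
# A quick, illustrative sanity check of the helpers above. The sample sentence
# and the tiny vocabulary built from it are made up for demonstration only and
# are not part of the training data.
_sample = ["He can't believe it's already noon!"]
_sample_clean = [clean_text(s) for s in _sample]
_sample_counts = {}
for _line in _sample_clean:
    for _word in _line.split():
        _sample_counts[_word] = _sample_counts.get(_word, 0) + 1
_sample_vocab = vocab_to_int_dict(_sample_counts)
print(_sample_clean)                                  # ['he cannot believe it is already noon!']
print(convert_to_ints(_sample_clean, _sample_vocab))  # [[0, 1, 2, 3, 4, 5, 6]]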

# Completes the text transformation by converting words into integers
def preprocess(clean_articles):
    # Word count
    word_count = {}
    for line in clean_articles:
        for word in line.split():
            if word not in word_count:
                word_count[word] = 1
            else:
                word_count[word] += 1

    vocab_to_int = vocab_to_int_dict(word_count)
    int_articles = convert_to_ints(clean_articles, vocab_to_int)

    # Find the maximum article length
    max_sequence_length = 0
    for line in int_articles:
        if len(line) > max_sequence_length:
            max_sequence_length = len(line)
    print("Max Sequence Length: ", max_sequence_length)

    # Add padding to all articles
    padded_articles = []
    for line in int_articles:
        adding = [vocab_to_int["<PAD>"]] * (max_sequence_length - len(line))
        padded_articles.append(line + adding)

    return padded_articles, max_sequence_length, vocab_to_int


# Importing data
articles = []
with open("./train.title.txt") as f:
    for line in f:
        articles.append(line)
articles = articles[:10000]

# Import the GloVe embeddings
embeddings_index = {}
with open("./glove.6B.300d.txt", encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

clean_articles = []
for article in articles:
    clean_articles.append(clean_text(article))

padded_articles, max_sequence_length, vocab_to_int = preprocess(clean_articles)
word_embedding_matrix = embedd_into_matrix(vocab_to_int, embeddings_index)

# Hyperparameters
embedding_size = 300
num_filters = 128
batch_size = 20
cell_size = 128
num_features = 7
lstm_keep_prob = 0.6

# Developing the graph
# Placeholders for inputs
input_x = tf.placeholder(tf.int32, [None, max_sequence_length], name="input_x")
input_num = tf.placeholder(tf.float32, [None, 100, num_features], name="input_num")
keep_prob = tf.placeholder(tf.float32, name="keep_prob")
targets = tf.placeholder(tf.float32, [batch_size, 1], name="targets")

with tf.name_scope("embedding"):
    embed_input = tf.nn.embedding_lookup(word_embedding_matrix, input_x)
    embed_input_expanded = tf.expand_dims(embed_input, -1)

# Convolution and max-pooling layer for each filter size
pooled_outputs = []
filter_sizes = [3, 4, 5]
for i, filter_size in enumerate(filter_sizes):
    with tf.name_scope("conv-maxpool-{}".format(str(filter_size))):
        # Convolution layer
        filter_shape = [filter_size, embedding_size, 1, num_filters]
        W = tf.Variable(tf.truncated_normal(shape=filter_shape, stddev=0.1), name="W")
        b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
        conv = tf.nn.conv2d(embed_input_expanded, W, strides=[1, 1, 1, 1],
                            padding="VALID", name="conv")
        # ReLU
        h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
        # Max-pooling
        pooled = tf.nn.max_pool(h,
                                ksize=[1, max_sequence_length - filter_size + 1, 1, 1],
                                strides=[1, 1, 1, 1],
                                padding="VALID", name="pool")
        pooled_outputs.append(pooled)

# Combine all pooled outputs
num_filters_total = num_filters * len(filter_sizes)
hpool = tf.concat(pooled_outputs, 3)
hpool_flat = tf.reshape(hpool, [-1, num_filters_total, 1])

with tf.name_scope("dropout"):
    h_drop = tf.nn.dropout(hpool_flat, keep_prob)

# Recurrent layer over the pooled textual features
with tf.name_scope("rnn_textual") as rt:
    cell = tf.contrib.rnn.LSTMCell(cell_size, initializer=tf.random_uniform_initializer(-0.1, 0.1))
    cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=lstm_keep_prob)
    initial_state = cell.zero_state(batch_size, dtype=tf.float32)
    rnn_outputs, rnn_states = tf.nn.dynamic_rnn(cell, hpool_flat,
                                                initial_state=initial_state,
                                                dtype=tf.float32, time_major=False)
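
# Illustrative shape check: the CNN branch collapses every article into
# num_filters_total pooled features, and the LSTM then reads them as a
# sequence of num_filters_total steps with a single feature per step.
print("hpool_flat:", hpool_flat.shape)    # (?, num_filters_total, 1)
print("rnn_outputs:", rnn_outputs.shape)  # (?, num_filters_total, cell_size)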

# Recurrent layer over the numeric features
with tf.name_scope("rnn_numeric") as rn:
    cell = tf.contrib.rnn.LSTMCell(cell_size, initializer=tf.random_uniform_initializer(-0.1, 0.1))
    cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=lstm_keep_prob)
    initial_state = cell.zero_state(batch_size, dtype=tf.float32)
    rnn_outputs_num, rnn_states_num = tf.nn.dynamic_rnn(cell, input_num,
                                                        initial_state=initial_state,
                                                        dtype=tf.float32,
                                                        scope="rnn_numeric",
                                                        time_major=False)

# Concatenate the textual and numeric RNN outputs and regress a single value
dense_layer_input = tf.concat([rnn_outputs, rnn_outputs_num], 1)
dense_layer_flat = tf.contrib.layers.flatten(dense_layer_input)
dense_dropout = tf.nn.dropout(dense_layer_flat, keep_prob)
final_output = tf.layers.dense(dense_dropout, 1,
                               activation=tf.nn.relu,
                               use_bias=True,
                               kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1),
                               trainable=True)
print("Graph Done.")

loss = tf.losses.mean_squared_error(labels=targets, predictions=final_output)
optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Split the padded articles into batches of batch_size
    batches_x = []
    print(final_output.shape)
    # print(rnn_outputs_num.shape)
    for i in range(len(padded_articles) // batch_size):
        batches_x.append(padded_articles[i * batch_size:(i + 1) * batch_size])

    # loss, _ = sess.run([loss, optimizer], feed_dict={})
    # for i, batch_x in enumerate(batches_x):
    #     # result = sess.run(hpool_flat, feed_dict={input_x: batch_x, keep_prob: 0.6})
    #     print("iteration: {} done.".format(str(i)))

    print("All done.")
    writer = tf.summary.FileWriter("./my_graph", sess.graph)
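
    # A minimal training-loop sketch, assuming random stand-ins for the numeric
    # features and the targets (neither has a real data source in this script).
    # It only demonstrates the feed shapes the placeholders expect.
    for i, batch_x in enumerate(batches_x):
        dummy_num = np.random.uniform(size=(batch_size, 100, num_features)).astype(np.float32)
        dummy_targets = np.random.uniform(size=(batch_size, 1)).astype(np.float32)
        batch_loss, _ = sess.run([loss, optimizer],
                                 feed_dict={input_x: batch_x,
                                            input_num: dummy_num,
                                            targets: dummy_targets,
                                            keep_prob: 0.6})
        if i % 50 == 0:
            print("batch {}: loss {:.4f}".format(i, batch_loss))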