import os
import time
import datetime
import re

import numpy as np
import tensorflow as tf
from nltk.corpus import stopwords

import data_helpers
from CNN import CNNTextual

# NOTE: `contractions` is assumed to be a dict mapping contractions to their
# expanded forms (e.g. "don't" -> "do not"), defined or imported elsewhere.

# Model hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 300, "Dimensionality of word embedding")
tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes")
tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size")
tf.flags.DEFINE_float("keep_prob", 0.6, "Dropout keep probability")

# Training parameters
tf.flags.DEFINE_integer("batch_size", 32, "Batch size for textual and technical indicators")
tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs")
tf.flags.DEFINE_integer("evaluate_every", 10, "Print details every (10) steps")
tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after every (100) steps")
tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store")


# Count the number of occurrences of each word in a set of sentences
def count_words(count_dict, text):
    for sentence in text:
        for word in sentence.split():
            if word not in count_dict:
                count_dict[word] = 1
            else:
                count_dict[word] += 1


# Preprocess a piece of text
def clean_text(text, remove_stopwords=False):
    # Convert words to lower case
    text = text.lower()

    # Replace contractions with their longer forms
    new_text = []
    for word in text.split():
        if word in contractions:
            new_text.append(contractions[word])
        else:
            new_text.append(word)
    text = " ".join(new_text)

    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\\', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = " ".join(w for w in text.split() if w not in stops)

    return text


# Map every word in the vocabulary to a unique integer id,
# plus special <UNK> (unknown word) and <PAD> (padding) tokens
def vocab_to_int_dict(word_count):
    vocab_to_int = {}
    for value, word in enumerate(word_count):
        vocab_to_int[word] = value
    vocab_to_int["<UNK>"] = len(vocab_to_int)
    vocab_to_int["<PAD>"] = len(vocab_to_int)
    return vocab_to_int


# Build the embedding matrix for the vocabulary from the GloVe vectors
def embedd_into_matrix(vocab_to_int, embeddings_index):
    embedding_dim = 300
    word_embedding_matrix = np.zeros((len(vocab_to_int), embedding_dim), dtype=np.float32)
    for word, i in vocab_to_int.items():
        if word in embeddings_index:
            word_embedding_matrix[i] = embeddings_index[word]
        else:
            # If the word is not in GloVe, create a random vector for it
            new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim), dtype=np.float32)
            word_embedding_matrix[i] = new_embedding
    if len(vocab_to_int) == len(word_embedding_matrix):
        print("All checks passed.")
    return word_embedding_matrix


# Convert word tokens into their integer representations
def convert_to_ints(text, vocab_to_int):
    ints = []
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int["<UNK>"])
        ints.append(sentence_ints)
    return ints


# Create a convolutional layer with optional max pooling
def new_conv_layer(input, num_input_channels, filter_size, num_filters, use_pooling=True):
    shape = [filter_size, filter_size, num_input_channels, num_filters]
    weights = tf.Variable(tf.truncated_normal(shape, stddev=0.1))
    biases = tf.Variable(tf.constant(0.1, shape=[num_filters]))
    layer = tf.nn.conv2d(input, weights, strides=[1, 1, 1, 1], padding='SAME') + biases
    layer = tf.nn.relu(layer)
    if use_pooling:
        layer = tf.nn.max_pool(layer, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    return layer, weights


if __name__ == "__main__":
    # Read the raw articles
    articles = []
    with open("./texts.txt") as f:
        for line in f:
            articles.append(line)

    # Preprocess every article
    clean_articles = []
    for article in articles:
        clean_articles.append(clean_text(article))

    # Count word occurrences across the cleaned articles
    word_count = {}
    count_words(word_count, clean_articles)
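    # --- Illustrative sanity check (not part of the original script) ---
    # Quick look at the preprocessing output and vocabulary size before the
    # GloVe vectors are loaded; the 80-character slices and prints are
    # assumptions added purely for illustration.
    if articles:
        print("Raw article:     ", articles[0][:80])
        print("Cleaned article: ", clean_articles[0][:80])
    print("Unique words in corpus:", len(word_count))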
    # Import the GloVe embeddings
    embeddings_index = {}
    with open("./data/glove.6B/glove.6B.300d.txt", encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            embedding = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = embedding

    # Build the vocabulary, convert the articles to integer ids,
    # and build the corresponding embedding matrix
    vocab_to_int = vocab_to_int_dict(word_count)
    int_articles = convert_to_ints(clean_articles, vocab_to_int)
    word_embedding_matrix = embedd_into_matrix(vocab_to_int, embeddings_index)

    # Find the maximum article length
    max_article_length = 0
    for line in int_articles:
        if len(line) > max_article_length:
            max_article_length = len(line)
    print("Max article length: ", max_article_length)

    # Pad all articles to the same length with the <PAD> token id
    padded_articles = []
    for line in int_articles:
        padding = [vocab_to_int["<PAD>"]] * (max_article_length - len(line))
        padded_articles.append(line + padding)

    # Creating the basic structure of the model (v1):
    # placeholders for inputs and outputs
    x_texts = tf.placeholder(tf.int32, [None, None], name='input_text')
    x_numeric = tf.placeholder(tf.float32, [None, None], name='input_numeric')
    targets = tf.placeholder(tf.float32, [None], name='targets')
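    # --- Illustrative sketch (not part of the original script) ---
    # One plausible next step: embed the padded articles with the GloVe
    # matrix built above and check that the placeholder shapes line up.
    # The variable name, the trainable=False choice, and the test feed are
    # assumptions, not the original model design.
    embedding_weights = tf.Variable(word_embedding_matrix, trainable=False,
                                    name='embedding_weights')
    embedded_text = tf.nn.embedding_lookup(embedding_weights, x_texts)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Feed a single padded article through the embedding lookup
        lookup = sess.run(embedded_text,
                          feed_dict={x_texts: [padded_articles[0]]})
        print("Embedded batch shape:", lookup.shape)  # (1, max_article_length, 300)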