import tensorflow as tf
import numpy as np
from CNN import CNNTextual
import os
import time
import datetime
import re
from nltk.corpus import stopwords
import data_helpers
# Model hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 300, "Dimensionality of word embedding")
tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes")
tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size")
tf.flags.DEFINE_float("keep_prob", 0.6, "Dropout keep probability")
# Training parameters
tf.flags.DEFINE_integer("batch_size", 32, "Batch Size for textual and technical indicators")
tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs")
tf.flags.DEFINE_integer("evaluate_every", 10, "Evaluate model on the dev set every (10) steps")
tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after every (100) steps")
tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store")
# Count the number of occurrences of each word in a set of sentences
def count_words(count_dict, text):
    for sentence in text:
        for word in sentence.split():
            if word not in count_dict:
                count_dict[word] = 1
            else:
                count_dict[word] += 1
# Preprocess a piece of text
def clean_text(text, remove_stopwords=False):
    # Convert words to lower case
    text = text.lower()
    # Replace contractions with their longer forms
    # (`contractions` is assumed to be a dict of contraction -> expanded form
    #  defined elsewhere in the project)
    text = text.split()
    new_text = []
    for word in text:
        if word in contractions:
            new_text.append(contractions[word])
        else:
            new_text.append(word)
    text = " ".join(new_text)
    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\\', ' ', text)
    text = re.sub(r'\'', ' ', text)
    # Optionally, remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]
        text = " ".join(text)
    return text
# Map every word in the vocabulary to a unique integer id
def vocab_to_int_dict(word_count):
    vocab_to_int = {}
    value = 0
    for word, _ in word_count.items():
        vocab_to_int[word] = value
        value += 1
    # Special tokens for unknown words and padding
    vocab_to_int["<UNK>"] = len(vocab_to_int)
    vocab_to_int["<PAD>"] = len(vocab_to_int)
    return vocab_to_int
# Build an embedding matrix with one row per word in the vocabulary
def embedd_into_matrix(vocab_to_int, embeddings_index):
    embedding_dim = 300
    word_embedding_matrix = np.zeros((len(vocab_to_int), embedding_dim), dtype=np.float32)
    for word, i in vocab_to_int.items():
        if word in embeddings_index:
            word_embedding_matrix[i] = embeddings_index[word]
        else:
            # If the word is not in GloVe, create a random vector for it
            new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim), dtype=np.float32)
            word_embedding_matrix[i] = new_embedding
    if len(vocab_to_int) == len(word_embedding_matrix):
        print("Embedding matrix and vocabulary sizes match.")
    return word_embedding_matrix
# Convert word tokens into their integer representations
def convert_to_ints(text, vocab_to_int):
    ints = []
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int["<UNK>"])
        ints.append(sentence_ints)
    return ints
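# Tiny illustrative example (hypothetical vocabulary, not project data):
#   convert_to_ints(["buy the dip"], {"buy": 0, "the": 1, "dip": 2, "<UNK>": 3})
#   -> [[0, 1, 2]]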
def new_conv_layer(input, num_input_channels, filter_size, num_filters, use_pooling=True):
    shape = [filter_size, filter_size, num_input_channels, num_filters]  # filter shape
    weights = tf.Variable(tf.truncated_normal(shape, stddev=0.1))
    biases = tf.Variable(tf.constant(0.1, shape=[num_filters]))
    layer = tf.nn.relu(tf.nn.conv2d(input, weights, strides=[1, 1, 1, 1], padding='SAME') + biases)
    if use_pooling:
        layer = tf.nn.max_pool(layer, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    return layer
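# Hypothetical usage sketch (illustrative names only, not wired in below): a
# batch of embedded text would be expanded to 4-D before being convolved, e.g.
#   embedded_expanded = tf.expand_dims(embedded_text, -1)   # [batch, seq_len, embed_dim, 1]
#   conv_out = new_conv_layer(embedded_expanded, num_input_channels=1,
#                             filter_size=3, num_filters=128)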
if __name__ == "__main__":
    articles = []
    with open("./texts.txt") as f:
        for line in f:
            articles.append(line)
    clean_articles = []
    for article in articles:
        clean_articles.append(clean_text(article))
    # Count word occurrences across all cleaned articles
    word_count = {}
    count_words(word_count, clean_articles)
    # Import the GloVe embeddings
    embeddings_index = {}
    with open("./data/glove.6B/glove.6B.300d.txt", encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            embedding = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = embedding
    vocab_to_int = vocab_to_int_dict(word_count)
    int_articles = convert_to_ints(clean_articles, vocab_to_int)
    word_embedding_matrix = embedd_into_matrix(vocab_to_int, embeddings_index)
    # Find the maximum article length
    max_article_length = 0
    for line in int_articles:
        if len(line) > max_article_length:
            max_article_length = len(line)
    print("Max Article Length: ", max_article_length)
    # Pad all articles to the same length with the <PAD> token id
    padded_articles = []
    for line in int_articles:
        padding = [vocab_to_int["<PAD>"]] * (max_article_length - len(line))
        padded_articles.append(line + padding)
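    # Minimal sketch (an assumption about the next step): stack the padded
    # articles into a dense int32 array so they can later be fed in batches to
    # the text placeholder defined below.
    x_text_data = np.array(padded_articles, dtype=np.int32)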
    # Creating the basic structure of the model (v1)
    # Placeholders for inputs and outputs
    x_texts = tf.placeholder(tf.int32, [None, None], name='input_text')
    x_numeric = tf.placeholder(tf.float32, [None, None], name='input_numeric')
    targets = tf.placeholder(tf.float32, [None], name='targets')
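    # Minimal sketch (an assumption, not the final model wiring): initialise an
    # embedding variable from the pre-trained GloVe matrix built above and map
    # the integer text placeholder onto dense word vectors.
    embedding = tf.Variable(word_embedding_matrix, trainable=False, name="embedding")
    embedded_text = tf.nn.embedding_lookup(embedding, x_texts)  # [batch, seq_len, embedding_dim]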