import os
import re
import time
import datetime
import numpy as np
import tensorflow as tf
from nltk.corpus import stopwords
from CNN import CNNTextual
import data_helpers
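# clean_text() below relies on a `contractions` mapping (contraction -> expanded form)
# that is not defined in this file. A minimal placeholder is sketched here; the real
# mapping is assumed to be defined or imported elsewhere with a much fuller word list.
contractions = {
    "can't": "cannot",
    "won't": "will not",
    "don't": "do not",
    "it's": "it is",
    "i'm": "i am",
}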
# Model hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 300, "Dimensionality of word embeddings")
tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes")
tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size")
tf.flags.DEFINE_float("keep_prob", 0.6, "Dropout keep probability")

# Training parameters
tf.flags.DEFINE_integer("batch_size", 32, "Batch size for textual and technical indicators")
tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs")
tf.flags.DEFINE_integer("evaluate_every", 10, "Evaluate the model after this many steps")
tf.flags.DEFINE_integer("checkpoint_every", 100, "Save a checkpoint after this many steps")
tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to keep")
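# The definitions above are registered on tf.flags.FLAGS. A minimal sketch of how the rest
# of the script would typically read them (the FLAGS name is an assumption, it does not
# appear in this file); individual values are then available as attributes,
# e.g. FLAGS.embedding_dim or FLAGS.batch_size.
FLAGS = tf.flags.FLAGS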
# Count the number of occurrences of each word in a set of texts
def count_words(count_dict, text):
    for sentence in text:
        for word in sentence.split():
            if word not in count_dict:
                count_dict[word] = 1
            else:
                count_dict[word] += 1
# Preprocess the text: lowercase, expand contractions, strip unwanted characters
def clean_text(text, remove_stopwords=False):
    # Convert words to lower case
    text = text.lower()

    # Replace contractions with their longer forms
    text = text.split()
    new_text = []
    for word in text:
        if word in contractions:
            new_text.append(contractions[word])
        else:
            new_text.append(word)
    text = " ".join(new_text)

    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&', '', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Optionally, remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]
        text = " ".join(text)

    return text
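# Example of the intended behavior (assuming "can't" is in the contractions mapping):
#   clean_text("I can't wait!") -> "i cannot wait "
# Punctuation is replaced by spaces, so stray whitespace may remain in the output.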
def vocab_to_int_dict(word_count):
    vocab_to_int = {}
    value = 0
    for word, _ in word_count.items():
        vocab_to_int[word] = value
        value += 1
    vocab_to_int["<UNK>"] = len(vocab_to_int)
    vocab_to_int["<PAD>"] = len(vocab_to_int)
    return vocab_to_int
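# For example (ids depend on dict iteration order; illustrative only):
#   vocab_to_int_dict({"stocks": 4, "rally": 2})
#   -> {"stocks": 0, "rally": 1, "<UNK>": 2, "<PAD>": 3}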
def embedd_into_matrix(vocab_to_int, embeddings_index):
    embedding_dim = 300
    word_embedding_matrix = np.zeros((len(vocab_to_int), embedding_dim), dtype=np.float32)
    for word, i in vocab_to_int.items():
        if word in embeddings_index:
            word_embedding_matrix[i] = embeddings_index[word]
        else:
            # If the word is not in GloVe, create a random vector for it
            new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim), dtype=np.float32)
            word_embedding_matrix[i] = new_embedding
    if len(vocab_to_int) == len(word_embedding_matrix):
        print("Embedding matrix has one row per vocabulary word.")
    return word_embedding_matrix
# Convert word tokens into their integer representations
def convert_to_ints(text, vocab_to_int):
    ints = []
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int["<UNK>"])
        ints.append(sentence_ints)
    return ints
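# new_conv_layer() below relies on new_weights() and new_biases(), which are not defined
# in this file. Minimal sketches are given here, assuming the usual truncated-normal
# weight initialization and small constant biases; the original helpers may differ.
def new_weights(shape):
    # Weight variable initialized from a truncated normal distribution
    return tf.Variable(tf.truncated_normal(shape, stddev=0.05))

def new_biases(length):
    # Bias variable initialized to a small positive constant
    return tf.Variable(tf.constant(0.05, shape=[length]))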
def new_conv_layer(input, num_input_channels, filter_size, num_filters, use_pooling=True):
    shape = [filter_size, filter_size, num_input_channels, num_filters]
    weights = new_weights(shape)
    biases = new_biases(length=num_filters)
    layer = tf.nn.conv2d(input, weights, strides=[1, 1, 1, 1], padding='SAME') + biases
    if use_pooling:
        layer = tf.nn.max_pool(layer, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    return tf.nn.relu(layer)
if __name__ == "__main__":

    # Load the articles, one per line
    articles = []
    with open("./texts.txt") as f:
        for line in f:
            articles.append(line)
    # Clean every article and count word occurrences with the helper defined above
    clean_articles = []
    for article in articles:
        clean_articles.append(clean_text(article))

    word_count = {}
    count_words(word_count, clean_articles)
    # Load the pre-trained GloVe embeddings into a word -> vector dictionary
    embeddings_index = {}
    with open("./data/glove.6B/glove.6B.300d.txt", encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            embedding = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = embedding

    # Build the vocabulary, integer-encode the articles, and build the embedding matrix
    vocab_to_int = vocab_to_int_dict(word_count)
    int_articles = convert_to_ints(clean_articles, vocab_to_int)
    word_embedding_matrix = embedd_into_matrix(vocab_to_int, embeddings_index)

    # Find the length of the longest (integer-encoded) article
    max_summary_length = 0
    for line in int_articles:
        if len(line) > max_summary_length:
            max_summary_length = len(line)
    print("Max article length: ", max_summary_length)

    # Pad every article with the <PAD> id so they all have the same length
    padded_articles = []
    for line in int_articles:
        adding = [vocab_to_int["<PAD>"]] * (max_summary_length - len(line))
        padded_articles.append(line + adding)
    # Creating the basic structure of the model (v1):
    # placeholders for the inputs and outputs
    x_texts = tf.placeholder(tf.int32, [None, None], name='input_text')
    x_numeric = tf.placeholder(tf.float32, [None, None], name='input_numeric')
    targets = tf.placeholder(tf.float32, [None], name='targets')
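    # How these pieces would be wired together is not shown in this file. A minimal sketch,
    # assuming the text branch looks words up in word_embedding_matrix and feeds the result
    # (as a 4-D tensor) into new_conv_layer(); the names embedded_text, embedded_expanded and
    # conv_out, and the literal filter_size/num_filters values, are illustrative assumptions.
    embedding = tf.Variable(word_embedding_matrix, name="embedding")
    embedded_text = tf.nn.embedding_lookup(embedding, x_texts)  # [batch, time, embedding_dim]
    embedded_expanded = tf.expand_dims(embedded_text, -1)       # add a channel dimension
    conv_out = new_conv_layer(embedded_expanded,
                              num_input_channels=1,
                              filter_size=3,
                              num_filters=128,
                              use_pooling=True)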