import os
import re
import time
import datetime
import numpy as np
import tensorflow as tf
from nltk.corpus import stopwords
from CNN import CNNTextual
import data_helpers
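# clean_text() below relies on a `contractions` mapping (contraction -> expanded form)
# that is not defined in this file. A minimal placeholder is sketched here; the real
# mapping is assumed to be defined or imported elsewhere with a much fuller word list.
contractions = {
    "can't": "cannot",
    "won't": "will not",
    "don't": "do not",
    "it's": "it is",
    "i'm": "i am",
}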
# Model hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 300, "Dimensionality of word embeddings")
tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes")
tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size")
tf.flags.DEFINE_float("keep_prob", 0.6, "Dropout keep probability")

# Training parameters
tf.flags.DEFINE_integer("batch_size", 32, "Batch size for textual and technical indicators")
tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs")
tf.flags.DEFINE_integer("evaluate_every", 10, "Evaluate the model after this many steps")
tf.flags.DEFINE_integer("checkpoint_every", 100, "Save a checkpoint after this many steps")
tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to keep")
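# The definitions above are registered on tf.flags.FLAGS. A minimal sketch of how the rest
# of the script would typically read them (the FLAGS name is an assumption, it does not
# appear in this file); individual values are then available as attributes,
# e.g. FLAGS.embedding_dim or FLAGS.batch_size.
FLAGS = tf.flags.FLAGS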
# Count the number of occurrences of each word in a set of texts
def count_words(count_dict, text):
    for sentence in text:
        for word in sentence.split():
            if word not in count_dict:
                count_dict[word] = 1
            else:
                count_dict[word] += 1
# Preprocess the text: lowercase, expand contractions, strip unwanted characters
def clean_text(text, remove_stopwords=False):
    # Convert words to lower case
    text = text.lower()

    # Replace contractions with their longer forms
    text = text.split()
    new_text = []
    for word in text:
        if word in contractions:
            new_text.append(contractions[word])
        else:
            new_text.append(word)
    text = " ".join(new_text)

    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&', '', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Optionally, remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]
        text = " ".join(text)

    return text
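# Example of the intended behavior (assuming "can't" is in the contractions mapping):
#   clean_text("I can't wait!") -> "i cannot wait "
# Punctuation is replaced by spaces, so stray whitespace may remain in the output.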
def vocab_to_int_dict(word_count):
    vocab_to_int = {}
    value = 0
    for word, _ in word_count.items():
        vocab_to_int[word] = value
        value += 1
    vocab_to_int["<UNK>"] = len(vocab_to_int)
    vocab_to_int["<PAD>"] = len(vocab_to_int)
    return vocab_to_int
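# For example (ids depend on dict iteration order; illustrative only):
#   vocab_to_int_dict({"stocks": 4, "rally": 2})
#   -> {"stocks": 0, "rally": 1, "<UNK>": 2, "<PAD>": 3}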
def embedd_into_matrix(vocab_to_int, embeddings_index):
    embedding_dim = 300
    word_embedding_matrix = np.zeros((len(vocab_to_int), embedding_dim), dtype=np.float32)
    for word, i in vocab_to_int.items():
        if word in embeddings_index:
            word_embedding_matrix[i] = embeddings_index[word]
        else:
            # If the word is not in GloVe, create a random vector for it
            new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim), dtype=np.float32)
            word_embedding_matrix[i] = new_embedding
    if len(vocab_to_int) == len(word_embedding_matrix):
        print("Embedding matrix has one row per vocabulary word.")
    return word_embedding_matrix
# Convert word tokens into their integer representations
def convert_to_ints(text, vocab_to_int):
    ints = []
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int["<UNK>"])
        ints.append(sentence_ints)
    return ints
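# new_conv_layer() below relies on new_weights() and new_biases(), which are not defined
# in this file. Minimal sketches are given here, assuming the usual truncated-normal
# weight initialization and small constant biases; the original helpers may differ.
def new_weights(shape):
    # Weight variable initialized from a truncated normal distribution
    return tf.Variable(tf.truncated_normal(shape, stddev=0.05))

def new_biases(length):
    # Bias variable initialized to a small positive constant
    return tf.Variable(tf.constant(0.05, shape=[length]))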
def new_conv_layer(input, num_input_channels, filter_size, num_filters, use_pooling=True):
    shape = [filter_size, filter_size, num_input_channels, num_filters]
    weights = new_weights(shape)
    biases = new_biases(length=num_filters)
    layer = tf.nn.conv2d(input, weights, strides=[1, 1, 1, 1], padding='SAME') + biases
    if use_pooling:
        layer = tf.nn.max_pool(layer, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    return tf.nn.relu(layer)
if __name__ == "__main__":

    # Load the articles, one per line
    articles = []
    with open("./texts.txt") as f:
        for line in f:
            articles.append(line)
    # Clean every article and count word occurrences with the helper defined above
    clean_articles = []
    for article in articles:
        clean_articles.append(clean_text(article))

    word_count = {}
    count_words(word_count, clean_articles)
    # Load the pre-trained GloVe embeddings into a word -> vector dictionary
    embeddings_index = {}
    with open("./data/glove.6B/glove.6B.300d.txt", encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            embedding = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = embedding

    # Build the vocabulary, integer-encode the articles, and build the embedding matrix
    vocab_to_int = vocab_to_int_dict(word_count)
    int_articles = convert_to_ints(clean_articles, vocab_to_int)
    word_embedding_matrix = embedd_into_matrix(vocab_to_int, embeddings_index)

    # Find the length of the longest (integer-encoded) article
    max_summary_length = 0
    for line in int_articles:
        if len(line) > max_summary_length:
            max_summary_length = len(line)
    print("Max article length: ", max_summary_length)

    # Pad every article with the <PAD> id so they all have the same length
    padded_articles = []
    for line in int_articles:
        adding = [vocab_to_int["<PAD>"]] * (max_summary_length - len(line))
        padded_articles.append(line + adding)
    # Creating the basic structure of the model (v1):
    # placeholders for the inputs and outputs
    x_texts = tf.placeholder(tf.int32, [None, None], name='input_text')
    x_numeric = tf.placeholder(tf.float32, [None, None], name='input_numeric')
    targets = tf.placeholder(tf.float32, [None], name='targets')
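    # How these pieces would be wired together is not shown in this file. A minimal sketch,
    # assuming the text branch looks words up in word_embedding_matrix and feeds the result
    # (as a 4-D tensor) into new_conv_layer(); the names embedded_text, embedded_expanded and
    # conv_out, and the literal filter_size/num_filters values, are illustrative assumptions.
    embedding = tf.Variable(word_embedding_matrix, name="embedding")
    embedded_text = tf.nn.embedding_lookup(embedding, x_texts)  # [batch, time, embedding_dim]
    embedded_expanded = tf.expand_dims(embedded_text, -1)       # add a channel dimension
    conv_out = new_conv_layer(embedded_expanded,
                              num_input_channels=1,
                              filter_size=3,
                              num_filters=128,
                              use_pooling=True)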