import tensorflow as tf
import numpy as np
from nltk.corpus import stopwords
import re
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are"
}
# text cleaning
def clean_text(text, remove_stopwords=False):
    # Convert words to lower case
    text = text.lower()

    # Replace contractions with their longer forms
    new_text = []
    for word in text.split():
        if word in contractions:
            new_text.append(contractions[word])
        else:
            new_text.append(word)
    text = " ".join(new_text)

    # Format words and remove unwanted characters
    # (<br /> is stripped before the punctuation pass so removing '/' cannot break the match)
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'&', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = " ".join(w for w in text.split() if w not in stops)

    return text
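
# Illustrative example (not part of the original pipeline): clean_text expands
# contractions and strips URLs, HTML remnants and punctuation, e.g.
#   clean_text("They're great!")  ->  "they are great "  (roughly; spacing is not collapsed
#   unless remove_stopwords=True re-joins the tokens)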
def vocab_to_int_dict(word_count):
    # Map every word in the corpus to an integer id, then append the special tokens.
    vocab_to_int = {}
    for value, word in enumerate(word_count):
        vocab_to_int[word] = value
    vocab_to_int["<UNK>"] = len(vocab_to_int)
    vocab_to_int["<PAD>"] = len(vocab_to_int)
    return vocab_to_int
def embed_into_matrix(vocab_to_int, embeddings_index):
    embedding_dim = 300
    word_embedding_matrix = np.zeros((len(vocab_to_int), embedding_dim), dtype=np.float32)
    for word, i in vocab_to_int.items():
        if word in embeddings_index:
            word_embedding_matrix[i] = embeddings_index[word]
        else:
            # If the word is not in GloVe, create a random vector for it
            word_embedding_matrix[i] = np.random.uniform(-1.0, 1.0, embedding_dim).astype(np.float32)
    if len(vocab_to_int) == len(word_embedding_matrix):
        print("Embedding matrix rows match the vocabulary size.")
    return word_embedding_matrix
# Convert word tokens into their integer representations
def convert_to_ints(text, vocab_to_int):
    ints = []
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int["<UNK>"])
        ints.append(sentence_ints)
    return ints
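
# For instance, with a hypothetical vocabulary vocab_to_int = {"the": 0, "cat": 1, "<UNK>": 2, "<PAD>": 3},
# convert_to_ints(["the cat sat"], vocab_to_int) returns [[0, 1, 2]] -- unseen words map to the <UNK> id.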
# Complete the text transformation: count words, convert them to integers and pad
def preprocess(clean_articles):
    # word count
    word_count = {}
    for line in clean_articles:
        for word in line.split():
            if word not in word_count:
                word_count[word] = 1
            else:
                word_count[word] += 1

    vocab_to_int = vocab_to_int_dict(word_count)
    int_articles = convert_to_ints(clean_articles, vocab_to_int)

    # find the maximum sequence length
    max_sequence_length = 0
    for line in int_articles:
        if len(line) > max_sequence_length:
            max_sequence_length = len(line)
    print("Max Sequence Length: ", max_sequence_length)

    # add padding to all articles
    padded_articles = []
    for line in int_articles:
        padding = [vocab_to_int["<PAD>"]] * (max_sequence_length - len(line))
        padded_articles.append(line + padding)

    return padded_articles, max_sequence_length, vocab_to_int
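
# preprocess returns a list of equal-length integer sequences (each article
# right-padded with the <PAD> id up to the longest article), that maximum
# length, and the word-to-id mapping used to encode them.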
# importing data
articles = []
with open("./train.title.txt") as f:
    for line in f:
        articles.append(line)
articles = articles[:10000]
# import the GloVe embeddings
embeddings_index = {}
with open("./glove.6B.300d.txt", encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding
clean_articles = []
for article in articles:
    clean_articles.append(clean_text(article))

padded_articles, max_sequence_length, vocab_to_int = preprocess(clean_articles)
word_embedding_matrix = embed_into_matrix(vocab_to_int, embeddings_index)
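
# word_embedding_matrix has shape (len(vocab_to_int), 300): one 300-d GloVe
# vector per vocabulary word, with random vectors for words missing from GloVe
# (e.g. the <UNK> and <PAD> tokens).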
# model hyperparameters
embedding_size = 300
num_filters = 128
batch_size = 20
cell_size = 128
num_features = 7
lstm_keep_prob = 0.6
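
# Architecture sketch (as built below): GloVe-embedded tokens pass through three
# parallel conv + max-pool branches (filter sizes 3/4/5, 128 filters each); the
# pooled CNN features feed one LSTM, a second LSTM runs over the separate
# numeric feature input (input_num), and both outputs are concatenated into a
# single dense unit that regresses onto `targets` with an MSE loss.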
# Developing Graph
# placeholders for inputs
input_x = tf.placeholder(tf.int32, [None, max_sequence_length], name="input_x")
input_num = tf.placeholder(tf.float32, [None, 100, num_features], name="input_num")
keep_prob = tf.placeholder(tf.float32, name="keep_prob")
targets = tf.placeholder(tf.float32, [batch_size, 1], name="targets")

with tf.name_scope("embedding"):
    embed_input = tf.nn.embedding_lookup(word_embedding_matrix, input_x)
    embed_input_expanded = tf.expand_dims(embed_input, -1)
# conv and max-pool layer for each filter size
pooled_outputs = []
filter_sizes = [3, 4, 5]
for i, filter_size in enumerate(filter_sizes):
    with tf.name_scope("conv-maxpool-{}".format(filter_size)):
        # Conv layer
        filter_shape = [filter_size, embedding_size, 1, num_filters]
        W = tf.Variable(tf.truncated_normal(shape=filter_shape, stddev=0.1), name="W")
        b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
        conv = tf.nn.conv2d(embed_input_expanded, W, strides=[1, 1, 1, 1], padding="VALID", name="conv")
        # relu
        h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
        # max-pooling over the full output height
        pooled = tf.nn.max_pool(h, ksize=[1, max_sequence_length - filter_size + 1, 1, 1], strides=[1, 1, 1, 1], padding="VALID", name="pool")
        pooled_outputs.append(pooled)
# Combine all pooled outputs
num_filters_total = num_filters * len(filter_sizes)
hpool = tf.concat(pooled_outputs, 3)
hpool_flat = tf.reshape(hpool, [-1, num_filters_total, 1])

with tf.name_scope("dropout"):
    h_drop = tf.nn.dropout(hpool_flat, keep_prob)
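
# Shape check: each pooled output is [batch, 1, 1, 128], the concat along the
# channel axis gives [batch, 1, 1, 384], and the reshape turns that into
# [batch, 384, 1], i.e. 384 "time steps" of size 1 for the textual LSTM below.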
# Recurrent layer over the CNN features of the text
with tf.name_scope("rnn_textual"):
    cell = tf.contrib.rnn.LSTMCell(cell_size, initializer=tf.random_uniform_initializer(-0.1, 0.1))
    cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=lstm_keep_prob)
    initial_state = cell.zero_state(batch_size, dtype=tf.float32)
    # feed the dropout-regularised CNN features (h_drop) into the LSTM
    rnn_outputs, rnn_states = tf.nn.dynamic_rnn(cell, h_drop, initial_state=initial_state, dtype=tf.float32, scope="rnn_textual", time_major=False)

# Recurrent layer over the numeric features
with tf.name_scope("rnn_numeric"):
    cell = tf.contrib.rnn.LSTMCell(cell_size, initializer=tf.random_uniform_initializer(-0.1, 0.1))
    cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=lstm_keep_prob)
    initial_state = cell.zero_state(batch_size, dtype=tf.float32)
    rnn_outputs_num, rnn_states_num = tf.nn.dynamic_rnn(cell, input_num, initial_state=initial_state, dtype=tf.float32, scope="rnn_numeric", time_major=False)
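
# Two separate LSTMs keep the textual CNN features and the raw numeric features
# (7 features over 100 steps) in independent branches before they are merged in
# the dense layer below.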
# Concatenate both RNN outputs and regress onto a single target value
dense_layer_input = tf.concat([rnn_outputs, rnn_outputs_num], 1)
dense_layer_flat = tf.contrib.layers.flatten(dense_layer_input)
dense_dropout = tf.nn.dropout(dense_layer_flat, keep_prob)
final_output = tf.layers.dense(dense_dropout, 1, activation=tf.nn.relu, use_bias=True, kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1), trainable=True)
print("Graph Done.")

loss = tf.losses.mean_squared_error(labels=targets, predictions=final_output)
optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(final_output.shape)

    # split the padded articles into batches of batch_size
    batches_x = []
    for i in range(len(padded_articles) // batch_size):
        batches_x.append(padded_articles[i * batch_size:(i + 1) * batch_size])

    # Training loop left disabled: running it also requires values for the
    # input_num and targets placeholders, which are not loaded in this script.
    # for i, batch_x in enumerate(batches_x):
    #     batch_loss, _ = sess.run([loss, optimizer], feed_dict={input_x: batch_x, keep_prob: 0.6})
    #     print("iteration: {} done.".format(i))

    print("All done.")
    writer = tf.summary.FileWriter("./my_graph", sess.graph)