# model.py

import tensorflow as tf
import numpy as np
import re
import os
import time
import datetime
from nltk.corpus import stopwords
from CNN import CNNTextual
import data_helpers

# Model hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 300, "Dimensionality of word embeddings")
tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes")
tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size")
tf.flags.DEFINE_float("keep_prob", 0.6, "Dropout keep probability")

# Training parameters
tf.flags.DEFINE_integer("batch_size", 32, "Batch size for textual and technical indicators")
tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs")
tf.flags.DEFINE_integer("evaluate_every", 10, "Evaluate the model after this many steps")
tf.flags.DEFINE_integer("checkpoint_every", 100, "Save the model after this many steps")
tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to keep")
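
# Flag values are read through the module-level FLAGS object (standard tf.flags
# usage; this access point is an assumption, the file only defines the flags above).
FLAGS = tf.flags.FLAGS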

# Count the number of occurrences of each word in a set of sentences
def count_words(count_dict, text):
    for sentence in text:
        for word in sentence.split():
            if word not in count_dict:
                count_dict[word] = 1
            else:
                count_dict[word] += 1
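
# clean_text below expands contractions through a `contractions` mapping that is
# not defined in this file; the entries here are a minimal illustrative sketch,
# not the project's full table.
contractions = {
    "can't": "cannot",
    "won't": "will not",
    "it's": "it is",
    "don't": "do not",
    "i'm": "i am",
}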

# Preprocess a piece of text
def clean_text(text, remove_stopwords=False):
    # Convert words to lower case
    text = text.lower()

    # Replace contractions with their longer forms
    text = text.split()
    new_text = []
    for word in text:
        if word in contractions:
            new_text.append(contractions[word])
        else:
            new_text.append(word)
    text = " ".join(new_text)

    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Optionally, remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]
        text = " ".join(text)

    return text

# Map each word in the vocabulary to a unique integer id
def vocab_to_int_dict(word_count):
    vocab_to_int = {}
    value = 0
    for word, _ in word_count.items():
        vocab_to_int[word] = value
        value += 1
    vocab_to_int["<UNK>"] = len(vocab_to_int)
    vocab_to_int["<PAD>"] = len(vocab_to_int)
    return vocab_to_int

# Build the embedding matrix: GloVe vectors where available, random vectors otherwise
def embedd_into_matrix(vocab_to_int, embeddings_index):
    embedding_dim = 300
    word_embedding_matrix = np.zeros((len(vocab_to_int), embedding_dim), dtype=np.float32)
    for word, i in vocab_to_int.items():
        if word in embeddings_index:
            word_embedding_matrix[i] = embeddings_index[word]
        else:
            # If the word is not in GloVe, create a random vector for it
            new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim), dtype=np.float32)
            word_embedding_matrix[i] = new_embedding
    if len(vocab_to_int) == len(word_embedding_matrix):
        print("All checks passed.")
    return word_embedding_matrix

# Convert word tokens into their integer representations
def convert_to_ints(text, vocab_to_int):
    ints = []
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int["<UNK>"])
        ints.append(sentence_ints)
    return ints

# Create a new convolutional layer (weights and biases; helper sketch below)
def new_conv_layer(input, num_input_channels, filter_size, num_filters, use_pooling=True):
    shape = [filter_size, filter_size, num_input_channels, num_filters]
    weights = new_weights(shape)
    biases = new_biases(length=num_filters)
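
# new_weights and new_biases are not defined in this file; the sketches below are
# assumptions about their intent (trainable filter weights and per-filter biases),
# not the project's original initialisers. A typical continuation of new_conv_layer
# would apply tf.nn.conv2d with these weights, add the biases, optionally max-pool,
# and pass the result through tf.nn.relu.
def new_weights(shape):
    # Trainable convolution filter weights, truncated-normal initialised
    return tf.Variable(tf.truncated_normal(shape, stddev=0.05))

def new_biases(length):
    # One trainable bias per output filter
    return tf.Variable(tf.constant(0.05, shape=[length]))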


if __name__ == "__main__":
    # Read the raw articles
    articles = []
    with open("./texts.txt") as f:
        for line in f:
            articles.append(line)

    # Clean every article
    clean_articles = []
    for article in articles:
        clean_articles.append(clean_text(article))

    # Count word occurrences across the cleaned articles
    word_count = {}
    count_words(word_count, clean_articles)

    # Import the GloVe embeddings
    embeddings_index = {}
    with open("./data/glove.6B/glove.6B.300d.txt", encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            embedding = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = embedding

    vocab_to_int = vocab_to_int_dict(word_count)
    int_articles = convert_to_ints(clean_articles, vocab_to_int)
    word_embedding_matrix = embedd_into_matrix(vocab_to_int, embeddings_index)

    # Find the maximum article length
    max_article_length = 0
    for line in int_articles:
        if len(line) > max_article_length:
            max_article_length = len(line)
    print("Max article length: ", max_article_length)

    # Pad all articles to the maximum length
    padded_articles = []
    for line in int_articles:
        padding = [vocab_to_int["<PAD>"]] * (max_article_length - len(line))
        padded_articles.append(line + padding)

    # Creating the basic structure of the model (v1)
    # Placeholders for inputs and outputs
    x_texts = tf.placeholder(tf.int32, [None, None], name='input_text')
    x_numeric = tf.placeholder(tf.float32, [None, None], name='input_numeric')
    targets = tf.placeholder(tf.float32, [None], name='targets')
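
    # A minimal sketch of how the text placeholder would typically be wired to the
    # pre-built embedding matrix (the name `embedded_texts` and the trainable=False
    # choice are assumptions, not part of the original file).
    embedding = tf.Variable(word_embedding_matrix, trainable=False, name='embedding')
    embedded_texts = tf.nn.embedding_lookup(embedding, x_texts)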