# -*- coding: utf-8 -*-
"""bts.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1SiePL-1XjGJ5rkRYbGeK7n__VuMJ0iYd
"""
import json
import os
import ssl
import string
import sys

import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import numpy as np
import pandas as pd
import scipy
from scipy.special import digamma, gammaln

# Tokenizer models, the stopword list, and WordNet data must be fetched once.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
with open('result.json') as f:
    data = json.load(f)
df = pd.DataFrame(data)
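# Assumed shape of the result.json loaded above (illustrative -- the code below
# only relies on each record having a "Content" field with the raw review text):
#   [{"Content": "Great location, tiny room...", "Ratings": {...}}, ...]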
def parseWords(content):  # Use nltk and stopwords to tokenize words
    tokenizedWords = []
    for sentence in nltk.sent_tokenize(content):
        stemmedWords = [stemmer.lemmatize(w.lower()) for w in nltk.word_tokenize(sentence)
                        if w not in string.punctuation]
        tokenizedWords += [v for v in stemmedWords if v not in stopWords]  # Remove stopwords
    return tokenizedWords
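# Illustrative call (exact output depends on the downloaded NLTK models):
#   parseWords("The rooms were clean.")  ->  ['room', 'clean']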
content = df['Content']
stemmer = nltk.wordnet.WordNetLemmatizer()
stopWords = set(stopwords.words('english'))
sentence_Vocabulary = []
for item in content:
    temp = parseWords(item)
    sentence_Vocabulary.append(temp)
review_data = df.to_dict('records')  # one dict per review, each with a 'Content' key
class Stn:
    def __init__(self, stn):
        self.stn = FreqDist(stn)
        self.label = -1

class Review:
    def __init__(self, each_review_data):
        self.Content = each_review_data.get("Content")
        stn_word = sentence_Vocabulary  # module-level tokenized sentences
        self.Stns = [Stn(stn) for stn in stn_word]
        single_word_dict = set()
        for stn in self.Stns:
            single_word_dict = single_word_dict | stn.stn.keys()
        self.single_word_dict = np.array(sorted(single_word_dict))
        self.NumOfsingle_word_dict = len(self.single_word_dict)
class Reviews:
    def __init__(self, data):
        self.Reviews = [Review(review) for review in data]
def countmatch(stn, aspects):  # Count keyword hits per aspect for one sentence
    count = np.zeros(len(aspects))
    for index, aspect in enumerate(aspects):
        for word in stn.stn.keys():
            if word in aspect:
                count[index] += 1
    return count
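# Illustrative call: with aspects = ["value", "room"], a sentence containing
# "room" and "clean" matches the second aspect only:
#   countmatch(Stn(['room', 'clean']), ["value", "room"])  ->  array([0., 1.])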
class Corpus:
    def __init__(self, review_data):
        self.Reviews = [Review(each_review_data) for each_review_data in review_data]
        self.Vocab = self.Reviews[0].single_word_dict  # every Review builds the same corpus-wide vocabulary
        self.Vlength = len(self.Vocab)
        self.aspect_words = []
def statistics(review, aspect):
    K = 5
    review.num_stn = 0
    review.num_stn_aspect = np.zeros(K)
    review.num_stn_word = np.zeros(review.NumOfsingle_word_dict)
    review.aspectwordlength = np.zeros((K, review.NumOfsingle_word_dict))
    for stn in review.Stns:
        if stn.label != -1:
            review.num_stn = review.num_stn + 1
def sentence_label(self, corpus):  # Label each sentence with the aspect(s) it matches most
    for review in corpus.Reviews:
        for stn in review.Stns:
            count = countmatch(stn, self.aspect_words)
            if count.max() > 0:
                stn.label = np.where(count == count.max())[0].tolist()
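# For example, if "location" and "service" tie for the most keyword matches,
# the sentence keeps a multi-aspect label such as [2, 4] (indices into aspect_words).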
def chisq(self, corpus):  # Chi-square association between each aspect and each vocabulary word
    K = 5     # number of aspects
    V = 1588  # vocabulary size
    Chi_sq = np.zeros((K, V))
    for k in range(K):
        for w in range(V):
            A = corpus.aspectwordlength[k, w]                           # term & aspect
            B = corpus.stnword[w] - corpus.aspectwordlength[k, w]       # term & not aspect
            C = corpus.aspectlength[k] - corpus.aspectwordlength[k, w]  # aspect & not term
            D = corpus.stnnum - A - B - C                               # neither
            N = corpus.stnnum
            denom = (A + B) * (A + C) * (B + D) * (C + D)
            Chi_sq[k, w] = N * (A * D - B * C) ** 2 / denom if denom else 0.0
    self.Chi_sq = Chi_sq
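# The four counts form the standard 2x2 contingency table for one
# (aspect k, word w) pair:
#                   word w    not word w
#   aspect k          A           C
#   not aspect k      B           D
# with N = A + B + C + D sentences in total.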
def load_aspect_words(output):
    aspect = ["value", "room", "location", "cleanliness", "service"]
    output.aspect_words = aspect
def create_W_mat(review, corpus):
    K = 5  # number of aspects
    review.W = np.zeros((K, len(review.single_word_dict)))
    for k in range(K):
        for w in range(len(review.single_word_dict)):
            # Fraction of occurrences of word w that fall in aspect-k sentences
            review.W[k, w] = review.aspectwordlength[k, w] / review.num_stn_word[w]
def create_all_W(corpus):
    for review in corpus.Reviews:
        create_W_mat(review, corpus)
def load_data_vocab(output, corpus, outputpath):
    if not os.path.exists(outputpath):
        os.makedirs(outputpath)
    vocabfile = outputpath + "vocab1.txt"
    with open(vocabfile, "w") as f:
        for w in corpus.Vocab:
            f.write(w + ",")
def load_data_review(output, corpus, outputpath):
    reviewfile = outputpath + "reviewdata.txt"
    with open(reviewfile, 'w') as f:
        for review in corpus.Reviews:
            f.write(review.Content)
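# A minimal end-to-end sketch of how the pieces above fit together. This is an
# assumption: the original Colab notebook ran these steps as separate cells, so
# the exact order and the "./output/" path are illustrative, not confirmed.
if __name__ == "__main__":
    corpus = Corpus(review_data)
    load_aspect_words(corpus)           # seed the five aspect keywords
    sentence_label(corpus, corpus)      # written method-style: corpus doubles as `self`
    for review in corpus.Reviews:
        statistics(review, corpus.aspect_words)
    load_data_vocab(None, corpus, "./output/")
    load_data_review(None, corpus, "./output/")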