import numpy as np

database = np.array([
    {'id': 'string',                    # unique identifier for the paper, following the convention P2_#number
     'title': 'string',                 # title of the paper
     'AffiliationCountry': 'string',    # name of the country the study was conducted in
     'year': 2018-2024,                 # year of publication, a value between 2018 and 2024
     'journal': 'string',               # name of the journal the paper was published in
     'citations': 0-1000,               # number of citations the paper has received - not reported in the paper
     'year_since': 3,                   # number of years since publication - not reported in the paper
     'cpy': 0,                          # number of citations per year - not reported in the paper
     'keywords': ['TAM', 'mbanking', 'awareness'],  # list of keywords, broken into K1-K10
     'abstract': 'string',              # abstract of the paper
     'F': ['perceived usefulness'],     # factors significant in the study, broken into F1-F9
     'FN': ['another factor'],          # factors not significant in the study, broken into FNS1-FNS4
     'limit': ['geographical context'], # limitations of the study, broken into LIMIT1-LIMIT3
     'typeofResearch': 'string',        # type of research conducted in the study
     'methods': ['regression analysis'],  # methods used in the study, broken into METHOD1-METHOD4
     'theory': ['TAM'],                 # theories used in the study, broken into THEORY1-THEORY4
     'sampleSize': 100,                 # sample size of the study
     'tech': 'string',                  # main technology studied
     'man_theme': 'string',             # theme manually assigned by me
     'algo_theme': 'string',            # theme assigned by the algorithm
     'decision_Theme': 'string',        # final theme of the paper
     'Score_Sig': 0.0,                  # % of significance for factors
     'Score_NOT_Sig': 0.0,              # % of non-significance for factors
    }])
Algorithmic Approach to Finding Themes
Part 0. Jupyter Notebook
If you want to run the entire code, use the Jupyter notebook on my GitHub page.
Part 1. Data Collection
I downloaded the PDFs of all 143 papers, read them, and extracted metadata based on the schema shown above.
Idea for future
🤖 Build an Agentic AI application that automates this process.
Part 1.1 Finding Out Themes
First, install the required Python modules (the full list is in the notebook). I saved the PDF file names in a dictionary like this:
name_of_pdfs = {
    'p2_01': "Lonkani et al_2020_A comparative study of trust in mobile banking.pdf",
    'p2_02': "Saprikis et al_2022_A comparative study of users versus non-users' behavioral intention towards.pdf",
    'p2_03': "Malaquias et al_2021_A cross-country study on intention to use mobile banking.pdf",
    'p2_04': "Merhi et al_2019_A cross-cultural study of the intention to use mobile banking between Lebanese.pdf",
    'p2_05': "Frimpong et al. - 2020 - A cross‐national investigation of trait antecedent.pdf",
    # and so on ...
}
Additionally, I defined a look-up dictionary that maps every factor in the dataset to its related theme. It looks like this (shortened for this presentation):
theme_of_words = {
    'demographic':
        list(set(['women', 'woman', 'female', 'men', 'man', 'male', 'sex', 'gender', 'age', 'income',
                  'demographic variables', 'elderly', 'education', 'gender differences', 'generation y', 'millennial generation',
                  'millennial', 'gen y', 'gen Z', 'gen alpha', 'gen X', 'boomer', 'babyboomer', 'generation X', 'generation z',
                  'young consumers',
                  # A lot more factors ...
                  ])),
    #----------------------------------------------------------------------------------------------------------------------------------
    'cultural':
        list(set(['developing countries', 'malaysia', 'transition country', 'pakistan',
                  'zakat', 'developing country', 'ghana', 'USA', 'srilanka', 'sri lanka',
                  'india', 'maldives', 'saudi-arabia', 'saudi arabia', 'nigeria', 'thailand', 'united states',
                  'yemen', 'citizenship', 'zimbabwe', 'palestine', 'culture',
                  'Country perspective',
                  # ...
                  ])),
    #----------------------------------------------------------------------------------------------------------------------------------
    'psychological':
        list(set(['anxiety', 'satisfaction', 'behavior', 'behaviour', 'attitudes', 'attitude', 'awareness',
                  'technology anxiety', 'consumer-behavior', 'trust', 'benv', 'consumer behaviour',
                  'covid-19 related psychological distress', 'psychological distress', 'psychological', 'distress',
                  'behavioral', 'computer anxiety', 'customer satisfaction', 'cognitive resistance',
                  # A LOT more ...
                  ])),
    # ... a few other key-value pairs corresponding to themes
}
I also needed to remove stop words, and I decided to extend the standard list with words I knew would be frequently repeated. I also define the stemmer and lemmatizer.
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

stop_words = stopwords.words('english')
stop_words.extend(["bank", "banking", "banks",
                   "mobile", "mbank", "mbanking", "m-bank", "m bank",
                   "online", "e", "e-bank", "ebank", "mobile banking", "mobile bank",
                   "adoption", "acceptance", "accept", "theory",
                   "purpose", "result", "method",  # from abstracts
                   "journal", "volume", "pp", "no", "doi", "http", "https", "et al", "issue",
                   "technology", "internet", "information system", "international information",
                   "information technology", "computer human", "mis quarterly", "electornic commerce",
                   "j market", "telematics and informatics", "telematics informatics", "retail consumer",
                   "international volume", "international business", "global information",
                   "et", "al", "al.", "tam", "sem", "pls", "utaut", "tpb",
                   ".com", "management", "marketing", "published", "study",
                   "research", "literature", "model",  # from journal information
                   "app", "application", "usage"])

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
Set up.
So, I need a few functions as setup for cleaning the text. The function extract_text_from_pdf() uses PyMuPDF to extract the text from a PDF file.
# version one, using PyMuPDF
import fitz  # PyMuPDF

def extract_text_from_pdf(filename):
    text = ""
    try:
        doc = fitz.open(filename)
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            text += page.get_text()
    except Exception as e:
        print(f"Error reading {filename}: {e}")
    return text
This is just one of the data-cleaning functions. For topic modeling, I write a function that builds a Gensim dictionary and corpus and serializes the corpus in the .mm (Matrix Market) file format.
from gensim.corpora import Dictionary, MmCorpus

def generate_dictionary(text, name):
    """
    Takes the tokenized documents to build the dictionary from, and the name of a .mm file.
    """
    dictionary = Dictionary(text)
    corpus = [dictionary.doc2bow(review) for review in text]

    filename = f"{name}.mm"
    MmCorpus.serialize(filename, corpus)

    return dictionary, corpus
Additionally, I want a function that prints the top 50 most frequently appearing words in the corpus:
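That function isn't reproduced here; a minimal sketch of what it could look like, assuming it works on the Gensim dictionary and corpus returned by generate_dictionary() (the names print_top_words and top_n are mine), is:

from collections import Counter

def print_top_words(dictionary, corpus, top_n=50):
    # tally raw counts across all bag-of-words documents in the corpus
    counts = Counter()
    for bow in corpus:
        for token_id, count in bow:
            counts[dictionary[token_id]] += count
    # print the most frequent tokens with their counts
    for word, count in counts.most_common(top_n):
        print(f"{word}: {count}")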
I also plan on seeing how Python clusters the words (as in, finds similar words) versus how I grouped them. This is a function for if you want to use a word embedding (it requires some effort, time, and machine power!):
from transformers import BertTokenizer, BertModel
import torch

def get_embedding(text):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model_bert = BertModel.from_pretrained('bert-base-uncased')

    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=20)
    with torch.no_grad():
        outputs = model_bert(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
And then a function to collect semantically similar words (this one relies on spaCy's vectors rather than the BERT embedding):
import spacy

nlp = spacy.load('en_core_web_md')  # assuming a spaCy model with word vectors

def get_semantically_similar_words(words, threshold=0.7):
    similar_words = set(words)
    for word in words:
        token = nlp(word)
        for vocab_word in nlp.vocab:
            if vocab_word.has_vector and vocab_word.is_alpha:
                similarity = token.similarity(nlp(vocab_word.text))
                if similarity >= threshold:
                    similar_words.add(vocab_word.text)
    return similar_words
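A hypothetical usage, e.g. to expand a theme's keyword list with near neighbours (expensive, since it scans the whole spaCy vocabulary):

# hypothetical call - not part of the final pipeline, which skips embeddings
expanded_psych = get_semantically_similar_words(['trust', 'anxiety'], threshold=0.75)
print(len(expanded_psych))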
So, how do I find the themes? Essentially, I just tweaked TF-IDF:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix

class CustomTfidfVectorizer(TfidfVectorizer):
    def __init__(self, vocabulary=None, **kwargs):
        super().__init__(vocabulary=vocabulary, **kwargs)
        # self.general_keywords = set(general_keywords)

    def build_analyzer(self):
        analyzer = super().build_analyzer()
        return lambda doc: [w for w in analyzer(doc)]  # if w not in self.general_keywords]

    def fit(self, raw_documents, y=None):
        self.fit_transform(raw_documents, y)
        return self

    def fit_transform(self, raw_documents, y=None):
        X = super().fit_transform(raw_documents, y)
        self.max_frequencies = self._compute_max_frequencies(X, raw_documents)
        return X

    def transform(self, raw_documents):
        X = super().transform(raw_documents)

        # Calculate augmented term frequency
        max_frequencies = self.max_frequencies
        max_frequencies[max_frequencies == 0] = 1  # Avoid division by zero
        augmented_tf = 0.5 + 0.5 * (X.toarray() / max_frequencies[:, None])

        # Penalize general keywords (disabled, since general_keywords is commented out above)
        # penalized_idf = self.idf_ * (1 - 0.8 * np.isin(self.get_feature_names_out(), list(self.general_keywords)))

        # Apply the (un-penalized) IDF
        augmented_tfidf = augmented_tf * self.idf_

        return csr_matrix(augmented_tfidf)

    def _compute_max_frequencies(self, X, raw_documents):
        max_frequencies = np.zeros(X.shape[0])
        for i, doc in enumerate(raw_documents):
            term_freq = {}
            for term in doc.split():
                if term in term_freq:
                    term_freq[term] += 1
                else:
                    term_freq[term] = 1
            max_frequencies[i] = max(term_freq.values())
        return max_frequencies
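As a quick sanity check, here is a toy example of my own (two short documents and a two-word vocabulary, not from the notebook) showing the shape of the output:

# toy documents and vocabulary - purely illustrative
toy_docs = ["trust trust risk", "risk usefulness usefulness usefulness"]
toy_vec = CustomTfidfVectorizer(vocabulary=["trust", "risk"])

toy_vec.fit(toy_docs)                          # fit() also records each document's max raw term frequency
print(toy_vec.transform(toy_docs).toarray())   # one row per document, one column per vocabulary word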
Keyword Analytics
import pandas as pd

try:
    df = pd.read_csv("P2_AR_04.csv", encoding='utf-8')
except UnicodeDecodeError:
    try:
        df = pd.read_csv("P2_AR_04.csv", encoding='latin-1')
    except Exception as e:
        error_message = str(e)
        df = None
Clean all the data you've gathered the same way the PDFs are cleaned. The preprocess_text() function isn't shown in full here, but it looks very similar to the cleaning helpers above.
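A rough sketch of what it might look like, assuming it reuses the stop_words, stemmer, and lemmatizer defined earlier (preprocess_list and preprocess_Dict, which appear later, are sketched as thin wrappers; the notebook has the exact versions):

import re

def preprocess_text(text):
    # lowercase, strip non-letters, drop stop words, then stem and lemmatize
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r"[^a-z\s-]", " ", text)
    tokens = [t for t in text.split() if t not in stop_words]
    tokens = [lemmatizer.lemmatize(stemmer.stem(t)) for t in tokens]
    return " ".join(tokens)

def preprocess_list(values):
    # clean every entry of a list (used for the theme_of_words values)
    return [preprocess_text(v) for v in values]

def preprocess_Dict(d):
    # clean every value of a dictionary (used for the papers' full text)
    return {k: preprocess_text(v) for k, v in d.items()}

Apply it to the metadata columns: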
df2 = df.copy()

columns_to_preprocess = ['Man_Theme',
                         'K1','K2','K3','K4','K5','K6','K7','K8','K9','K10',
                         'F1','F2','F3','F4','F5','F6','F7','F8','F9',
                         'FNS1','FNS2','FNS3','FNS4',
                         'METHOD1','METHOD2','METHOD3','METHOD4',
                         'THEORY1','THEORY2','THEORY3','THEORY4',
                         'LIMIT1','LIMIT2','LIMIT3', 'Abstract'
                         ]

for col in columns_to_preprocess:
    df2[col] = df2[col].apply(preprocess_text)
papers = {}

for paper_id, filename in name_of_pdfs.items():
    text = extract_text_from_pdf(filename)
    papers[paper_id] = text

papers_df = pd.DataFrame.from_dict(papers, orient='index', columns=['paperText'])
papers_df = papers_df.reset_index(names=['paperID'])
papers_df.to_csv('papers_unclean.csv')
papers_df.head()
The raw texts still need cleaning, so I keep a copy first and then clean all the papers:
# keep a copy (a habit of mine)
papers_uncleaned = papers.copy()
theme_of_words_uncleaned = theme_of_words.copy()

# clean up all papers
papers_cleaned = preprocess_Dict(papers)
papersClean_df = pd.DataFrame.from_dict(papers_cleaned, orient='index', columns=['paperText'])
papersClean_df = papersClean_df.reset_index(names=['paperID'])
papersClean_df.to_csv('papers_clean3.csv')
papersClean_df.head()
Clean the theme of words dictionary so the words match:
theme_of_words_cleaned = {}

for k, v in theme_of_words.items():
    theme_of_words_cleaned[k] = preprocess_list(v)

theme_of_words_cleaned['psychological'][:10]

['initial trst', 'simcong', 'hedmotiv', 'selfefficacy', 'strss', 'controlled motiv', 'cong', 'selfcong cong', 'trst systems', 'custom loyalti']
Make sure to drop all NAs and empty values:
import math

for k, v in theme_of_words_cleaned.items():
    theme_of_words_cleaned[k] = [x for x in v if x not in [None, "", ' ', 'NaN'] and not (isinstance(x, float) and math.isnan(x))]
Themes Based on Count of Words in Each Group
I will skip the parts on BERT and word embeddings that are in the Jupyter notebook, as they were not used in my project. The reason is that they were taking too long and my computer simply did not have the capacity to handle them. I also didn't have time to find a fix, but there is a bunch of commented-out code from ChatGPT that I was playing around with.
So, we now have a dictionary mapping the factors to their theme, and a corpus of the text of all papers in the dataset. What I'm going to do here is (a tiny worked example follows the list):
- Count all the words (total_words) in each paper
- Count the instances of each word (factor) from theme_of_words_cleaned in each paper
- Each word's count adds 1 point to its corresponding theme's "weight score"
- Take, for example, the word emotion, which I assigned to the theme psychological. If emotion shows up 10 times in paper 1, then paper 1's dictionary of weights gets a weight of 10 (divided by the total number of words) for psychological.
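In made-up numbers, that scoring looks like this:

# toy illustration with invented counts - not real data from the papers
word_counts = {"psychological": 10, "demographic": 4}   # themed-word hits in one paper
total_words = 2000
group_weights = {f"{g}_w": c / total_words for g, c in word_counts.items()}
print(group_weights)                                    # {'psychological_w': 0.005, 'demographic_w': 0.002}
print(max(group_weights, key=group_weights.get))        # 'psychological_w' -> the paper's theme is psychological

The full loop over all papers: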
from collections import defaultdict

results = []

for doc_id, text in papers_cleaned.items():
    doc = nlp(text)
    word_counts = defaultdict(int)

    for token in doc:
        for group, keywords in theme_of_words_cleaned.items():
            if token.text.lower() in keywords:
                word_counts[group] += 1

    total_words = len(doc)
    group_weights = {f"{group}_w": count / total_words for group, count in word_counts.items()}
    max_weight = max(group_weights.values(), default=0)
    theme = max(group_weights, key=group_weights.get).replace("_w", "") if max_weight > 0 else None

    result = {"doc_id": doc_id, **word_counts, **group_weights, "theme": theme, "max_weight": max_weight}
    results.append(result)

count_words_df = pd.DataFrame(results)

for group in theme_of_words_cleaned.keys():
    if group not in count_words_df.columns:
        count_words_df[group] = 0
    if f"{group}_w" not in count_words_df.columns:
        count_words_df[f"{group}_w"] = 0.0

count_words_df.head()
This is still very basic, though, because it is based only on the factors, which may not be fully representative. So, I do this again using keywords in addition to the factors. These are keywords that were selected by the authors, as well as information extracted from the Web of Science BibTeX file.
cols_toPick = ['K1','K2','K3','K4','K5','K6','K7','K8','K9','K10','F1','F2','F3','F4','F5','F6','F7','F8','F9']

keywordsDf = df2.loc[:, cols_toPick]

# flatten the dataframe to a list
keywords_across_db = keywordsDf.values.flatten().tolist()

# there are 2,717 words here
print("number of words (factors and keywords) in total ", len(keywords_across_db))

# making sure there are no empty/NaN/Null values
keywords_across_db = [x for x in keywords_across_db if x not in [None, "", ' ', 'NaN'] and not (isinstance(x, float) and math.isnan(x))]

# making sure there are no duplicates (set takes care of this)
keywords_across_db_nodup_cleaned = list(set(keywords_across_db))

# convert the list into a dictionary temporarily, then convert it to a dataframe
temp = {'Keyword': keywords_across_db_nodup_cleaned}
keywords_themes_df = pd.DataFrame(temp, columns=['Keyword'])

# go back to theme_of_words_cleaned and find each keyword's theme
keywords_themes_dic = {keyword: theme for theme, keywords in theme_of_words_cleaned.items() for keyword in keywords}

# This is just a dataframe view of the keywords with their respective theme
keywords_themes_df['Theme'] = keywords_themes_df['Keyword'].map(keywords_themes_dic)

# if the theme is empty, assign "Generic" - these keywords weren't in the list of important words we picked themes for
keywords_themes_df['Theme'] = keywords_themes_df['Theme'].apply(lambda x: 'Generic' if pd.isna(x) or x == ' ' else x)

keywords_themes_df.head()
I'm going to get rid of all the Generic keywords, so I keep a copy of this dataframe first:
keywords_themes_df_withGenerics = keywords_themes_df.copy()

keywords_themes_df['Theme'] = keywords_themes_df['Theme'].apply(lambda x: 'Generic' if pd.isna(x) or x == ' ' else x)

# everything but Generic
keywords_themes_df = keywords_themes_df.loc[keywords_themes_df['Theme'] != 'Generic']

# flatten it to build a vocabulary
words_acrossAll_nonGeneric = keywords_themes_df['Keyword'].values.flatten().tolist()

# making sure no null values were generated
words_acrossAll_nonGeneric = [x for x in words_acrossAll_nonGeneric if x not in [None, "", ' ', 'NaN'] and not (isinstance(x, float) and math.isnan(x))]

# making sure there are no duplicates (233 words total)
words_acrossAll_nonGeneric = list(set(words_acrossAll_nonGeneric))
Theme Assignment
I will now use the custom TF-IDF class to generate a TF-IDF matrix. This is similar to what I did by hand a bit further above; essentially, TF-IDF counts how often each word appears in a document and down-weights words that appear across many documents.
vectorizer_keys = CustomTfidfVectorizer(vocabulary=words_acrossAll_nonGeneric)

tfidf_matrix = vectorizer_keys.fit_transform(papers_cleaned.values())

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=papers_cleaned.keys(), columns=vectorizer_keys.get_feature_names_out())

tfidf_df.head()
Now, using the TF-IDF scores, find the theme weights for each paper:
keyword_to_theme = {keyword: theme for theme, keywords in theme_of_words_cleaned.items() for keyword in keywords}

theme_weights = pd.DataFrame(0, index=tfidf_df.index, columns=theme_of_words_cleaned.keys())

for keyword, theme in keyword_to_theme.items():
    if keyword in tfidf_df.columns:
        theme_weights[theme] += tfidf_df[keyword]

for _, row in df2.iterrows():
    paper_id = row['ID']
    keywords = words_acrossAll_nonGeneric

    for keyword in keywords:
        if keyword in tfidf_df.columns:
            theme = keyword_to_theme.get(keyword, None)
            if theme:
                # if theme in tfidf_df.index:
                theme_weights.at[paper_id, theme] += tfidf_df.at[paper_id, keyword] * 5

# pick just 1 theme per paper - the theme with the maximum weight is the main theme of the paper
main_theme_df = theme_weights.apply(lambda row: (row == row.max()).astype(int), axis=1)

main_theme_df.head()
Visualizing this:
import matplotlib.pyplot as plt
import seaborn as sns

fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(20, 10))

# Heatmap for theme weights
sns.heatmap(theme_weights.T, ax=axes[0], cmap="YlGnBu_r", cbar_kws={'label': 'Weight'})
axes[0].set_xlabel("Paper ID")
axes[0].set_ylabel("Theme")
axes[0].set_title("Theme Weights per Paper")

# Heatmap for main themes
sns.heatmap(main_theme_df.T, ax=axes[1], cmap="YlGnBu_r", cbar_kws={'label': 'Theme Presence'})
axes[1].set_xlabel("Paper ID")
axes[1].set_ylabel("Theme")
axes[1].set_title("Main Theme per Paper")

plt.tight_layout()
plt.savefig('main_themes_heatmap_1.png')  # save before show so the figure isn't blank
plt.show()
To extract the themes easily:
# Extract themes for each paper
themes_for_papers = {
    paper_id: main_theme_df.columns[row.astype(bool)].tolist()
    for paper_id, row in main_theme_df.iterrows()
}

# Print the themes for each paper
for paper_id, themes in themes_for_papers.items():
    print(f"Paper ID: {paper_id}, Themes: {', '.join(themes)}")
Paper ID: p2_01, Themes: psychological
Paper ID: p2_02, Themes: psychological
Paper ID: p2_03, Themes: psychological
Paper ID: p2_04, Themes: psychological
Paper ID: p2_05, Themes: psychological
Paper ID: p2_06, Themes: perceptive
Paper ID: p2_07, Themes: personal
Paper ID: p2_08, Themes: personal
Paper ID: p2_09, Themes: perceptive
Paper ID: p2_10, Themes: psychological
Since topic modeling is multi-membership, papers can have more than one theme. Because I didn't use a word embedding and may not get the most representative theme here, I decided to allow up to 3 themes per paper. To do this, I changed the CustomTfidfVectorizer class.
ALLOWING FOR MULTIPLE THEMES
class CustomTfidfVectorizerUpdateClass(TfidfVectorizer):
    def __init__(self, theme_keywords, threshold=0.8, **kwargs):
        # Generate vocabulary from theme_keywords
        vocabulary = list(set(word for words in theme_keywords.values() for word in words))
        super().__init__(vocabulary=vocabulary, **kwargs)
        self.threshold = threshold            # Threshold for determining multiple themes
        self.theme_keywords = theme_keywords  # Store theme_keywords for later use

    def build_analyzer(self):
        analyzer = super().build_analyzer()
        return lambda doc: [w for w in analyzer(doc)]

    def fit(self, raw_documents, y=None):
        self.fit_transform(raw_documents, y)
        return self

    def fit_transform(self, raw_documents, y=None):
        X = super().fit_transform(raw_documents, y)
        self.max_frequencies = self._compute_max_frequencies(X, raw_documents)
        return X

    def transform(self, raw_documents):
        X = super().transform(raw_documents)

        # Calculate augmented term frequency
        max_frequencies = self.max_frequencies
        max_frequencies[max_frequencies == 0] = 1  # Avoid division by zero
        augmented_tf = 0.5 + 0.5 * (X.toarray() / max_frequencies[:, None])

        augmented_tfidf = augmented_tf  # No penalized IDF applied here

        return csr_matrix(augmented_tfidf)

    def determine_themes(self, documents_dict):
        """
        Determines the themes for each document based on TF-IDF scores.
        A paper can have multiple themes if the scores are within the threshold.

        :param documents_dict: Dictionary of documents (keys: IDs, values: text)
        :return: Dictionary where keys are document IDs and values are lists of themes
        """
        document_ids = list(documents_dict.keys())
        raw_documents = list(documents_dict.values())

        X = self.transform(raw_documents).toarray()
        feature_name_to_index = {name: i for i, name in enumerate(self.get_feature_names_out())}

        theme_scores = {}
        for doc_index, doc_vector in enumerate(X):
            doc_id = document_ids[doc_index]
            # Calculate scores for each theme
            scores = {
                theme: sum(doc_vector[feature_name_to_index[word]]
                           for word in keywords if word in feature_name_to_index)
                for theme, keywords in self.theme_keywords.items()
            }
            max_score = max(scores.values()) if scores else 0

            # Determine themes within the threshold
            selected_themes = [
                theme for theme, score in scores.items()
                if score >= self.threshold * max_score
            ]
            theme_scores[doc_id] = selected_themes

        return theme_scores

    def _compute_max_frequencies(self, X, raw_documents):
        max_frequencies = np.zeros(X.shape[0])
        for i, doc in enumerate(raw_documents):
            term_freq = {}
            for term in doc.split():
                if term in term_freq:
                    term_freq[term] += 1
                else:
                    term_freq[term] = 1
            max_frequencies[i] = max(term_freq.values())
        return max_frequencies
Similar to the task above, I generate the vectorizer from the theme_of_words_cleaned vocabulary, but this time allow for a few more themes (threshold = 0.75).
vectorizer_keys2 = CustomTfidfVectorizerUpdateClass(theme_keywords=theme_of_words_cleaned, threshold=0.75)

tfidf_matrix2 = vectorizer_keys2.fit_transform(papers_cleaned.values())

tfidf_df2 = pd.DataFrame(tfidf_matrix2.toarray(), index=papers_cleaned.keys(), columns=vectorizer_keys2.get_feature_names_out())
tfidf_df2.head()
You can still pick a main theme for each paper, but I want to see the top 3 themes:
keyword_to_theme2 = {keyword: theme for theme, keywords in theme_of_words_cleaned.items() for keyword in keywords}

theme_weights2 = pd.DataFrame(0, index=tfidf_df2.index, columns=theme_of_words_cleaned.keys())

for keyword, theme in keyword_to_theme2.items():
    if keyword in tfidf_df2.columns:
        theme_weights2[theme] += tfidf_df2[keyword]

for _, row in df2.iterrows():
    paper_id = row['ID']
    # for keyword in words_acrossAll_nonGeneric:
    for keyword in vectorizer_keys2.get_feature_names_out():
        if keyword in tfidf_df2.columns:
            theme = keyword_to_theme2.get(keyword, None)
            if theme:
                # the 5 here just makes the weights a bit larger for visualization; it's a simple scaling and won't change the results
                theme_weights2.at[paper_id, theme] += tfidf_df2.at[paper_id, keyword] * 5

top_3_themes_for_papers = {
    paper_id: theme_weights2.loc[paper_id]
                  .sort_values(ascending=False)[:3]
                  .index.tolist()
    for paper_id in theme_weights2.index
}

for paper_id, themes in top_3_themes_for_papers.items():
    print(f"Paper ID: {paper_id}, Top 3 Themes: {', '.join(themes)}")
Paper ID: p2_01, Top 3 Themes: psychological, cultural, demographic
Paper ID: p2_02, Top 3 Themes: psychological, market, personal
Paper ID: p2_03, Top 3 Themes: psychological, personal, perceptive
Paper ID: p2_04, Top 3 Themes: psychological, personal, perceptive
Paper ID: p2_05, Top 3 Themes: cultural, personal, psychological
Paper ID: p2_06, Top 3 Themes: perceptive, cultural, psychological
Paper ID: p2_07, Top 3 Themes: personal, psychological, demographic
Paper ID: p2_08, Top 3 Themes: psychological, personal, perceptive
Paper ID: p2_09, Top 3 Themes: perceptive, psychological, personal
Paper ID: p2_10, Top 3 Themes: psychological, cultural, personal
Keep a copy and save this to file:
df3 = df2.copy()

top_3_themes_for_papers = {
    paper_id: ", ".join(theme_weights2.loc[paper_id]
                            .sort_values(ascending=False)[:3]
                            .index.tolist())
    for paper_id in theme_weights2.index
}

df3["Algo_Theme"] = df2["ID"].map(top_3_themes_for_papers)

df3.to_csv("NewWithThemes.csv")