"""Module: citation
This module contains functions that change the format of citations or extract details from them. It also includes
functions to fix some typos.
"""
import re
from typing import Literal, List, Dict, Any, Union
import pandas as pd
from systematic_review import string_manipulation, search_count
from systematic_review import converter
def citations_to_ris_converter(input_file_path: str, output_filename: str = "output_ris_file.ris",
                               input_file_type: str = "read_csv") -> None:
    """Interactively convert tabular citation data into a ris file.

    The input file is loaded with the pandas reader named by ``input_file_type``, a preview of the data is printed
    and the user is then asked (through ``input``) which column holds each citation field. One ris record is
    written for every row of the input data.

    Parameters
    ----------
    input_file_path : str
        this is the path of input file
    output_filename : str
        this is the name of the output ris file with extension. output file path is also valid choice. The file is
        opened in append mode, so records are added after any content that already exists.
    input_file_type : str
        this function default is csv but other formats are also supported by putting 'read_{file_type}'. such as
        input_file_type = 'read_excel' all file type supported by pandas can be used by putting pandas IO tools
        methods. for more info visit- https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html

    Returns
    -------
    None

    Raises
    ------
    AttributeError
        if ``input_file_type`` is not the name of an attribute of pandas.
    KeyError
        if one of the column names typed by the user does not exist in the input data.
    """
    df = getattr(pd, input_file_type)(input_file_path)
    # NOTE: this changes the pandas display option for the whole process, not only for the preview below.
    pd.set_option("display.max_columns", None)
    print(df.head())
    print("please specify column names for following data in input file:")
    article_type = input("specify column for type of given citations. if it's journal article, input :JOUR")
    # (ris tag, column name) pairs; the list literal is evaluated left to right, so the prompts appear in this order.
    tagged_columns = [
        ("AU", input("provide name of authors column")),
        ("PY", input("provide name of Publication Year column")),
        ("TI", input("provide name of Item Title column")),
        ("JO", input("provide name of Publication Title column")),
        ("VL", input("provide name of Journal Volume column")),
        ("IS", input("provide name of Journal Issue column")),
        ("UR", input("provide name of URL column")),
        ("DO", input("provide name of Item DOI column")),
    ]
    # The context manager closes the file even when a row lookup raises (for example on a misspelled column name).
    with open(output_filename, "a") as output_file:
        for _, record in df.iterrows():
            output_file.write(f"TY - {article_type}\n")
            for tag, column in tagged_columns:
                output_file.write(f"{tag} - {record[column]!s}\n")
            # ER closes the current record.
            output_file.write("ER -" + "\n")
    print("ris file has been generated")
def edit_ris_citation_paste_values_after_regex_pattern(input_file_path: str, output_filename: str = "output_file.ris",
                                                       edit_line_regex: str = r'^DO ', paste_value: str = "ER - ") \
        -> None:
    """Copy a ris file and insert ``paste_value`` on a new line after every line matching ``edit_line_regex``.

    This is created to edit ris files which doesn't specify ER for 'end of citations' and paste ER after end point
    of citation. Replace 'DO' with other ris classifiers such as TY, JO etc. to paste after a different line.

    Parameters
    ----------
    input_file_path : str
        this is the path of input file
    output_filename : str
        this is the name of the output ris file with extension. The file is opened in append mode, so the copy is
        added after any content that already exists.
    edit_line_regex : str
        this is the regex to find ris classifiers lines such as DO, TY, JO etc. It is matched against the start of
        each line.
    paste_value : str
        this is value to be pasted, most helpful is ER ris classifier which signify citation end.

    Returns
    -------
    None
    """
    marker_pattern = re.compile(edit_line_regex)
    # Both files are closed by the context manager even if reading or writing fails part way through.
    with open(input_file_path, "r") as input_file, open(output_filename, "a") as output_file:
        for line in input_file:
            output_file.write(line)
            if marker_pattern.match(line):
                # The last line of a file may lack a newline; without this the pasted value would be glued onto it.
                if not line.endswith("\n"):
                    output_file.write("\n")
                output_file.write(f"{paste_value}\n")
def get_details_via_article_name_from_citations(article_name: str, sources_name_citations_path_list_of_dict: list,
                                                doi_url: bool = False,
                                                title_column_name: str = "title") -> Union[dict, None]:
    """Find the citation whose preprocessed title equals article_name and report which source it came from.

    Parameters
    ----------
    article_name : str
        This is the primary title of the citation or name of the article. It is compared with the result of
        ``string_manipulation.preprocess_string`` applied to each citation title.
    sources_name_citations_path_list_of_dict : list
        Sequence whose element ``[0]`` is the source name and whose element ``[1]`` is the iterable of citation
        records of that source.
    doi_url : bool
        This signify if we want to get the value of url and doi from citation
    title_column_name : str
        This is the name of column which contain citation title

    Returns
    -------
    dict or None
        For the first matching citation, a dict with the keys "article_name" and "source_name", plus "doi" and
        "url" when ``doi_url`` is True and the citation has them. None when no citation matches.
    """
    for citations in sources_name_citations_path_list_of_dict[1]:
        if article_name != string_manipulation.preprocess_string(citations[title_column_name]):
            continue
        article_title_source_name_dict = {"article_name": article_name,
                                          "source_name": sources_name_citations_path_list_of_dict[0]}
        if doi_url:
            # Look the two fields up independently so that a missing doi does not discard an available url.
            field_missing = False
            for field in ("doi", "url"):
                try:
                    article_title_source_name_dict[field] = citations[field]
                except KeyError:
                    field_missing = True
            if field_missing:
                print("doi or url not present")
        return article_title_source_name_dict
    # No citation title matched.
    return None
def get_details_of_all_article_name_from_citations(filtered_list_of_dict: list,
                                                   sources_name_citations_path_list_of_dict: list,
                                                   doi_url: bool = False, title_column_name: str = "title") -> list:
    """Look up the source details of every article in filtered_list_of_dict.

    Each article title is printed and then searched with ``get_details_via_article_name_from_citations``.

    Parameters
    ----------
    filtered_list_of_dict : list
        This is the list of article citations dict after filtering it using min_limit on grouped_keywords_count
    sources_name_citations_path_list_of_dict : list
        Source name and citations, passed unchanged to ``get_details_via_article_name_from_citations``.
    doi_url : bool
        This signify if we want to get the value of url and doi from citation
    title_column_name : str
        This is the name of the key holding the title in each element of filtered_list_of_dict.

    Returns
    -------
    list
        One entry per article, in input order: whatever ``get_details_via_article_name_from_citations`` returned
        for that article.
    """
    collected_details = []
    for record in filtered_list_of_dict:
        title = record[title_column_name]
        print("article: ", title)
        # NOTE(review): title_column_name is not forwarded, so the lookup uses its own default title column -
        # confirm whether that is intended.
        found = get_details_via_article_name_from_citations(title, sources_name_citations_path_list_of_dict, doi_url)
        collected_details.append(found)
    return collected_details
def get_missed_articles_source_names(missed_articles_list: list, all_articles_title_source_name_list_of_dict: list,
                                     article_column_name: str = "article_name",
                                     source_column_name: str = "source_name") -> list:
    """Return the article name and source name of every known article that is listed as missed.

    Parameters
    ----------
    missed_articles_list : list
        This contains the list of articles that got missed while downloading.
    all_articles_title_source_name_list_of_dict : list
        This list contains all article names with source names. (optional url and doi). Entries that are None
        are skipped.
    article_column_name : str
        This is the name of article column in the all_articles_title_source_name_list_of_dict.
    source_column_name : str
        This is the name of source column in the all_articles_title_source_name_list_of_dict.

    Returns
    -------
    list
        This list contains dicts with the keys "article_name" and "source_name", in the order the articles appear
        in all_articles_title_source_name_list_of_dict.
    """
    # Build the lookup set once instead of once per article.
    missed_articles = set(missed_articles_list)
    return [
        {"article_name": entry[article_column_name], "source_name": entry[source_column_name]}
        for entry in all_articles_title_source_name_list_of_dict
        # get_details_of_all_article_name_from_citations stores None for articles it could not find.
        if entry is not None and entry[article_column_name] in missed_articles
    ]
def drop_columns_based_on_column_name_list(dataframe: pd.DataFrame, column_name_list: list) -> pd.DataFrame:
    """Return a copy of dataframe without the columns named in column_name_list.

    Parameters
    ----------
    dataframe : pandas.DataFrame object
        This dataframe contains columns which we want to drop or remove. It is not modified.
    column_name_list : list
        This is the name of dataframe columns to be removed

    Returns
    -------
    pandas.DataFrame object
        DataFrame with columns mentioned in column_name_list removed.
    """
    return dataframe.drop(columns=column_name_list)
def drop_search_words_count_columns(dataframe, search_words_object: search_count.SearchWords) -> pd.DataFrame:
    """Remove the columns that were created from the search words.

    Parameters
    ----------
    dataframe : pandas.DataFrame object
        This dataframe contains keywords columns which we want to drop or remove.
    search_words_object : search_count.SearchWords
        Object whose ``get_sorting_keywords_criterion_list()`` method returns the names of the columns to remove.

    Returns
    -------
    pandas.DataFrame object
        DataFrame with keywords columns removed.
    """
    return drop_columns_based_on_column_name_list(dataframe,
                                                  search_words_object.get_sorting_keywords_criterion_list())
def add_multiple_sources_column(citation_dataframe: pd.DataFrame,
                                group_by: Union[list, None] = None) -> pd.DataFrame:
    """Add a 'multiple_sources' column listing every source in which a citation appears.

    Rows are grouped on ``group_by``; the values of the 'source' column of each group are collected into a list and
    that list is attached to every row of the group.

    Parameters
    ----------
    citation_dataframe : pandas.DataFrame object
        Input dataset which contains citations or article title with sources more than one. It must have a
        'source' column. It is not modified.
    group_by : list
        column label or sequence of labels that identify the same citation. Defaults to ['title', 'year'].

    Returns
    -------
    pandas.DataFrame object
        DataFrame with additional column with list of sources names
    """
    # None stands for the documented default; this avoids sharing one mutable list between calls.
    group_keys = ['title', 'year'] if group_by is None else group_by
    sources_per_citation = citation_dataframe.groupby(group_keys)['source'].apply(list).reset_index()
    sources_per_citation = sources_per_citation.rename(columns={"source": "multiple_sources"})
    # A left merge keeps every original row and attaches the list collected for its group.
    return citation_dataframe.merge(sources_per_citation, how='left', on=group_keys)
def add_citation_text_column(dataframe_object: pd.DataFrame, title_column_name: str = "title",
                             abstract_column_name: str = "abstract",
                             keyword_column_name: str = "keywords") -> pd.DataFrame:
    """Add a 'citation_text' column made of the title, abstract and keywords of each citation.

    The three values are converted to text and joined with single spaces. A missing value contributes an empty
    string, so the literal words "nan" or "None" never end up in the text.

    Parameters
    ----------
    dataframe_object : pandas.DataFrame object
        this is the object of famous python library pandas. for more lemma_info: https://pandas.pydata.org/docs/
        The column is added to this object in place.
    title_column_name : str
        This is the name of column which contain citation title
    abstract_column_name : str
        This is the name of column which contain citation abstract
    keyword_column_name : str
        This is the name of column which contain citation keywords

    Returns
    -------
    pd.DataFrame
        the same dataframe_object, now with the 'citation_text' column.
    """
    text_parts = []
    for column_name in (title_column_name, abstract_column_name, keyword_column_name):
        column = dataframe_object[column_name]
        # astype(str) would turn a missing value into "nan"/"None"; blank those cells out instead.
        text_parts.append(column.astype(str).where(column.notna(), ""))
    title_text, abstract_text, keyword_text = text_parts
    dataframe_object["citation_text"] = title_text + " " + abstract_text + " " + keyword_text
    return dataframe_object
def drop_duplicates_citations(citation_dataframe: pd.DataFrame, subset: list = ['title', 'year'],
                              keep: Literal["first", "last", False] = 'first',
                              index_reset: bool = True) -> pd.DataFrame:
    """Return DataFrame with duplicate citations removed.

    Parameters
    ----------
    citation_dataframe : pandas.DataFrame object
        Input dataset which contains duplicate rows. It is not modified.
    subset : list
        column label or sequence of labels used for identifying duplicates. Defaults to ['title', 'year']; pass
        None to compare all of the columns.
    keep : str or bool
        options includes {'first', 'last', False}, default 'first'. Determines which duplicates (if any) to keep.

        - ``first`` : Drop duplicates except for the first occurrence.
        - ``last`` : Drop duplicates except for the last occurrence.
        - False : Drop all duplicates.
    index_reset : bool
        If True (default) the result gets a fresh 0..n-1 index. If False the surviving rows keep their original
        index labels.

    Returns
    -------
    pandas.DataFrame object
        DataFrame with duplicates removed
    """
    # The list default is kept because None is meaningful to pandas here; the list is only read, never mutated.
    clean_df = citation_dataframe.drop_duplicates(subset=subset, keep=keep)
    if index_reset:
        # drop=True discards the old labels. Passing index_reset straight through as ``drop`` used to reset the
        # index even for index_reset=False and added a stray "index" column.
        clean_df = clean_df.reset_index(drop=True)
    return clean_df
class Citations:
    """Load the citation files found under one folder and expose them as a DataFrame, records list or file."""

    def __init__(self, citations_files_parent_folder_path, title_column_name: str = "title",
                 text_manipulation_method_name: str = "preprocess_string_to_space_separated_words"):
        """
        Parameters
        ----------
        citations_files_parent_folder_path : str
            this is the path of parent folder of where citations files exists.
        title_column_name : str
            name of the column holding the citation title; the cleaned title is stored in the column
            "cleaned_" + title_column_name.
        text_manipulation_method_name : str
            name of the text manipulation method, passed to ``string_manipulation.text_manipulation_methods`` when
            the cleaned title column is created.
        """
        self.text_manipulation_method_name = text_manipulation_method_name
        self.title_column_name = title_column_name
        self.citations_files_parent_folder_path = citations_files_parent_folder_path

    def create_citations_dataframe(self) -> pd.DataFrame:
        """Executes citation step.

        This function load all the citations from path, add required columns for next steps, and remove duplicates.
        The citation files are read again on every call.

        Returns
        -------
        pandas.DataFrame object
            DataFrame with additional columns needed for next steps of systematic review and duplicates are removed
        """
        full_list = converter.load_multiple_ris_citations_files(self.citations_files_parent_folder_path)
        full_list_df = converter.records_list_to_dataframe(full_list)
        # NOTE(review): the helper calls below use their default column names ('title', 'year', 'abstract',
        # 'keywords'); self.title_column_name is only used for the cleaned title column - confirm this is intended.
        complete_df = add_multiple_sources_column(full_list_df)
        complete_df = add_citation_text_column(complete_df)
        new_column_name = "cleaned_" + self.title_column_name
        complete_df = converter.apply_custom_function_on_dataframe_column(complete_df,
                                                                          self.title_column_name,
                                                                          string_manipulation.text_manipulation_methods,
                                                                          new_column_name,
                                                                          self.text_manipulation_method_name)
        complete_citations_df = drop_duplicates_citations(complete_df)
        return complete_citations_df

    def get_records_list(self) -> List[Dict[str, Any]]:
        """Executes citation step and converts the result to a list of records.

        This function load all the citations from path, add required columns for next steps, and remove duplicates.

        Returns
        -------
        List[Dict[str, Any]]
            list with additional columns needed for next steps of systematic review and duplicates are removed
        """
        return converter.dataframe_to_records_list(self.create_citations_dataframe())

    def get_dataframe(self) -> pd.DataFrame:
        """executes the create citations dataframe function and outputs the pd.DataFrame

        Returns
        -------
        pd.DataFrame
            outputs the citations data.
        """
        return self.create_citations_dataframe()

    def to_csv(self, output_filename: Union[str, None] = "output.csv", index: bool = True) -> None:
        """This function saves pandas.DataFrame to csv file.

        Parameters
        ----------
        output_filename : str
            This is the name of output file which should contains .csv extension
        index : bool
            Define if index is needed in output csv file or not.

        Returns
        -------
        None
        """
        converter.dataframe_to_csv_file(self.get_dataframe(), output_filename, index)

    def to_excel(self, output_filename: Union[str, None] = "output.xlsx", index: bool = True) -> None:
        """This function saves pandas.DataFrame to excel file.

        Parameters
        ----------
        output_filename : str
            This is the name of output file which should contains .xlsx extension. The default used to be
            "output.csv", which contradicts this requirement.
        index : bool
            Define if index is needed in output excel file or not.

        Returns
        -------
        None
        """
        converter.dataframe_to_excel_file(self.get_dataframe(), output_filename, index)