"""Module: converter
This module contains functions for file handling and data type conversion, such as list to text file, pandas
DataFrame to list of dicts, and many more.
"""
import json
from collections import defaultdict
from typing import Union, List, Dict, Any
import pandas as pd
import rispy
from systematic_review import os_utils, string_manipulation
def dataframe_to_csv_file(dataframe_object: pd.DataFrame, output_filename: Union[str, None] = "output.csv",
                          index: bool = True):
    """Write a pandas DataFrame to a csv file.

    Parameters
    ----------
    dataframe_object : pandas.DataFrame
        The dataframe to export. For more info: https://pandas.pydata.org/docs/
    output_filename : str
        Name or path of the output file; it should carry the .csv extension. The value is forwarded unchanged to
        ``DataFrame.to_csv``.
    index : bool
        Whether the dataframe index is written as a column of the csv file.

    Returns
    -------
    None
    """
    csv_options = {"index": index}
    dataframe_object.to_csv(output_filename, **csv_options)
def dataframe_to_excel_file(dataframe_object: pd.DataFrame, output_filename: Union[str, None] = "output.xlsx",
                            index: bool = True):
    """Write a pandas DataFrame to an excel file.

    Parameters
    ----------
    dataframe_object : pandas.DataFrame
        The dataframe to export. For more info: https://pandas.pydata.org/docs/
    output_filename : str
        Name or path of the output file; it should carry the .xlsx extension. The default is "output.xlsx"
        because pandas selects the excel writer engine from the file extension and has no engine for ".csv".
    index : bool
        Whether the dataframe index is written as a column of the excel file.

    Returns
    -------
    None
    """
    dataframe_object.to_excel(output_filename, index=index)
def dataframe_to_records_list(dataframe: pd.DataFrame) -> List[Dict[str, Any]]:
    """Turn every row of a pandas dataframe into a dictionary (records format).

    Parameters
    ----------
    dataframe : pd.DataFrame
        The dataframe whose rows are converted.

    Returns
    -------
    List[Dict[str, Any]]
        One dictionary per row, keyed by column name. Example - [{'primary_title' : "this is first title"},
        {'primary_title' : "this is second title"}, {'primary_title' : "this is third title"}]
    """
    return dataframe.to_dict(orient='records')
def extract_pandas_df_column1_row_values_based_on_column2_value(pandas_dataframe, column2_value,
                                                                column2name="source_name", column1name="article_name"):
    """Extract the column1 values of the rows whose column2 value equals ``column2_value``.

    Parameters
    ----------
    pandas_dataframe : pd.DataFrame
        Dataframe containing at least the two columns named by ``column2name`` and ``column1name``.
    column2_value : object
        Value to match in column2. Usually a str, but any object pandas can compare with ``==`` works.
    column2name : str
        Name of the column used to filter the rows.
    column1name : str
        Name of the column whose values are returned.

    Returns
    -------
    list
        Values of column1 for the matching rows, in dataframe order. Empty when no row matches.
    """
    matching_rows = pandas_dataframe.loc[pandas_dataframe[column2name] == column2_value]
    # The result is only returned; the former debug print of the list was removed so that library callers
    # do not get unexpected output on stdout.
    return [record[column1name] for record in matching_rows.to_dict('records')]
def apply_custom_function_on_dataframe_column(dataframe: pd.DataFrame, column_name: str, custom_function,
                                              new_column_name: str = None, *args, **kwargs) -> pd.DataFrame:
    """Apply ``custom_function`` to every element of a dataframe column.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Dataframe holding the column to transform. It is modified in place and also returned.
    column_name : str
        Name of the column whose elements are passed to ``custom_function``.
    custom_function
        Callable invoked as ``custom_function(element, *args, **kwargs)`` for each element.
    new_column_name : str
        When given (and non-empty), the transformed values are stored under this name and the original column is
        left untouched; otherwise the original column is overwritten.
    *args, **kwargs
        Extra arguments forwarded to ``custom_function``.

    Returns
    -------
    pd.DataFrame
        The same dataframe object, carrying the transformed column.
    """
    target_column = new_column_name if new_column_name else column_name
    transformed = dataframe[column_name].apply(lambda element: custom_function(element, *args, **kwargs))
    dataframe[target_column] = transformed
    return dataframe
def add_preprocess_column(dataframe_object: pd.DataFrame, column_name: str = "title", ):
    """Add a "cleaned_<column_name>" column holding the preprocessed text of ``column_name``.

    Parameters
    ----------
    dataframe_object : pandas.DataFrame
        Dataframe containing the column that needs to be preprocessed.
    column_name : str
        Name of the column to preprocess.

    Returns
    -------
    pandas.DataFrame
        Dataframe with the additional "cleaned_" + column_name column.
    """
    # Each element goes through string_manipulation.text_manipulation_methods with the
    # "preprocess_string_to_space_separated_words" method name.
    return apply_custom_function_on_dataframe_column(
        dataframe_object,
        column_name,
        string_manipulation.text_manipulation_methods,
        new_column_name="cleaned_" + column_name,
        text_manipulation_method_name="preprocess_string_to_space_separated_words",
    )
def dataframe_column_counts(dataframe, column_name):
    """Count how often each unique element occurs in a dataframe column.

    Thin wrapper around ``pandas.Series.value_counts()``.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Dataframe which contains the column to be counted.
    column_name : str
        Name of the column whose elements are counted.

    Returns
    -------
    object
        Result of ``value_counts()``: the unique column elements with their counts.
    """
    selected_column = dataframe[column_name]
    return selected_column.value_counts()
def try_convert_dataframe_column_elements_to_list(dataframe: pd.DataFrame, column_name: str) -> List[list]:
    """Convert each element of a dataframe column to a list, skipping elements that cannot be converted.

    Parameters
    ----------
    dataframe : pd.DataFrame
        The dataframe with the column to convert.
    column_name : str
        Name of the column for conversion.

    Returns
    -------
    List[list]
        One list per convertible element. Elements for which ``list()`` raises TypeError are reported on stdout
        and left out of the result.
    """
    converted_elements = []
    for cell in dataframe[column_name]:
        try:
            cell_as_list = list(cell)
        except TypeError:
            print(f"'{cell}' can not be converted to list")
        else:
            converted_elements.append(cell_as_list)
    return converted_elements
def dict_key_value_to_records(dictionary: dict, key_column_name: str, value_column_name: str):
    """Convert {key: value, key1: value1, ...} into records usable to build a pd.DataFrame.

    The result looks like [{key_column_name: key, value_column_name: value}, ...].

    Parameters
    ----------
    dictionary : dict
        Dictionary that contains the key and value pairs.
    key_column_name : str
        Record field name under which each key is stored.
    value_column_name : str
        Record field name under which each value is stored.

    Returns
    -------
    list
        One two-field dictionary per item of ``dictionary``, in the dictionary's iteration order.
    """
    return [
        {key_column_name: key, value_column_name: value}
        for key, value in dictionary.items()
    ]
def unpack_list_of_lists_with_optional_apply_custom_function(list_of_lists: List[list], custom_function=None) -> list:
    """Flatten a list of lists into one list, optionally transforming every element.

    Example - [[1,2,3], [3,4,5]] becomes [1,2,3,3,4,5].

    Parameters
    ----------
    list_of_lists : List[list]
        List whose elements are themselves lists (or other iterables) of elements.
    custom_function
        Optional callable applied to each inner element before it is added to the result. Ignored when falsy.

    Returns
    -------
    list
        All inner elements in order, transformed by ``custom_function`` when one is given.
    """
    if custom_function:
        return [custom_function(item) for inner in list_of_lists for item in inner]
    return [item for inner in list_of_lists for item in inner]
def unpack_list_of_lists(list_of_lists):
    """Flatten one level of nesting: inner lists are spliced in, every other element is kept as is.

    Parameters
    ----------
    list_of_lists : list
        List consisting of plain elements and lists. Example ["first_element", ["second_element"]]

    Returns
    -------
    list
        Resultant list consisting of only elements. Example ["first_element", "second_element"]
    """
    flattened = []
    for item in list_of_lists:
        # Only objects whose type is exactly ``list`` are unpacked; tuples, strings, etc. stay whole.
        flattened.extend(item if type(item) is list else [item])
    return flattened
def records_list_to_dataframe(list_of_dicts: List[Dict[str, Any]]) -> pd.DataFrame:
    """Build a pandas dataframe from a list of dictionaries (records).

    Parameters
    ----------
    list_of_dicts : List[Dict[str, Any]]
        The records to convert. Example - [{'primary_title' : "this is the title"}]

    Returns
    -------
    pd.DataFrame
        Dataframe created with ``pd.DataFrame.from_records``: one row per dictionary.
    """
    return pd.DataFrame.from_records(list_of_dicts)
def ris_file_to_records_list(ris_file_path: str) -> List[Dict[str, Any]]:
    """Load a .ris file into a list of citation dictionaries using rispy (https://pypi.org/project/rispy/).

    For more info on the ris format, visit: https://en.wikipedia.org/wiki/RIS_(file_format)

    Parameters
    ----------
    ris_file_path : str
        Filepath of the ris file.

    Returns
    -------
    List[Dict[str, Any]]
        Citations in records format (same as in pandas). Every citation gets an extra "source" key holding the
        value of ``os_utils.get_filename_from_path(ris_file_path)``.
    """
    with open(ris_file_path, 'r') as ris_handle:
        citations = rispy.load(ris_handle)
    origin = os_utils.get_filename_from_path(ris_file_path)
    for citation in citations:
        citation["source"] = origin
    return citations
def ris_file_to_pandas_dataframe(ris_file_path: str) -> pd.DataFrame:
    """Read a ris citations file with 'rispy' and convert the resulting list of dicts to a pandas.DataFrame.

    Parameters
    ----------
    ris_file_path : str
        Path of the ris citations file.

    Returns
    -------
    pd.DataFrame
        Dataframe built from the entries loaded from the ris file.
    """
    with open(ris_file_path, 'r') as ris_handle:
        ris_entries = rispy.load(ris_handle)
    return records_list_to_dataframe(ris_entries)
def load_multiple_ris_citations_files(citations_files_parent_folder_path: str) -> List[dict]:
    """Load every ris citations file found under a folder into one list of citations.

    Parameters
    ----------
    citations_files_parent_folder_path : str
        Path of the parent folder where the citations files exist.

    Returns
    -------
    List[dict]
        Citation dicts of all files whose path ends with ".ris", concatenated in the order the paths are
        returned by ``os_utils.extract_files_path_from_directories_or_subdirectories``.
    """
    candidate_paths = os_utils.extract_files_path_from_directories_or_subdirectories(
        citations_files_parent_folder_path)
    all_citations = []
    for candidate in candidate_paths:
        if not candidate.endswith(".ris"):
            continue
        all_citations.extend(ris_file_to_records_list(candidate))
    return all_citations
def load_multiple_ris_citations_files_to_dataframe(citations_files_parent_folder_path: str) -> pd.DataFrame:
    """Load every ris citations file found under a folder into a single dataframe.

    Parameters
    ----------
    citations_files_parent_folder_path : str
        Path of the parent folder where the citations files exist.

    Returns
    -------
    pd.DataFrame
        Dataframe of the citations of all ris files, one row per citation.
    """
    return records_list_to_dataframe(
        load_multiple_ris_citations_files(citations_files_parent_folder_path))
def list_to_text_file(filename: str, list_name: list, permission: str = "w"):
    """Write a list to a text file, one element per line.

    Parameters
    ----------
    filename : str
        Name or path of the text file to write.
    list_name : list
        The list (or any iterable) whose elements are written; each element is converted with ``str()``.
    permission : str
        File mode passed to the built-in ``open()`` (for example "w" to overwrite or "a" to append). It must be
        a text mode because the elements are written as strings.

    Returns
    -------
    None
    """
    with open(filename, permission) as text_file:
        # Every element is followed by a newline, so the file ends with a trailing "\n".
        text_file.writelines(str(item) + "\n" for item in list_name)
def list_to_string(list_name):
    """Join the elements of a list into one string, each element followed by a newline.

    Parameters
    ----------
    list_name : list
        The list which contains some data; elements are converted with ``str()``.

    Returns
    -------
    str
        Text string comprising all data of the list, one element per line (including a trailing newline).
    """
    return "".join(str(entry) + "\n" for entry in list_name)
def dict_values_data_type(dictionary):
    """Group the keys of a dictionary by the data type of their values.

    Parameters
    ----------
    dictionary : dict
        Dictionary which contains different types of objects as values. Example - {"first": [2, 5], "sec": 3}

    Returns
    -------
    dict
        A ``defaultdict(list)`` mapping ``str(type(value))`` to the keys holding that type. For the example above:
        {"<class 'list'>": ["first"], "<class 'int'>": ["sec"]}
    """
    keys_by_type = defaultdict(list)
    for key in dictionary:
        type_label = str(type(dictionary[key]))
        keys_by_type[type_label].append(key)
    return keys_by_type
def text_file_to_list(file_path: str, permission: str = "r"):
    """Read a text file into a list with one element per line; the first line is ``result[0]``.

    Parameters
    ----------
    file_path : str
        Path or name of the text file to read.
    permission : str
        File mode passed to the built-in ``open()``.

    Returns
    -------
    list
        The file content split on "\\n": [first line, second line, ...]. A file ending with a newline yields a
        trailing empty string.
    """
    with open(file_path, permission) as text_file:
        content = text_file.read()
    lines = content.split("\n")
    return lines
def load_text_file(file_path: str, permission: str = "r"):
    """Read a text file and return its whole content.

    For more info visit - https://docs.python.org/3/tutorial/inputoutput.html

    Parameters
    ----------
    file_path : str
        Path or name of the text file.
    permission : str
        File mode passed to the built-in ``open()``.

    Returns
    -------
    str
        Everything returned by ``read()`` on the opened file, i.e. all lines of the file.
    """
    with open(file_path, permission) as text_file:
        return text_file.read()
def remove_empty_lines(input_file_path: str, output_filename: str = "output_file.ris") -> None:
    """Copy the input file to the output file while dropping its blank lines.

    Parameters
    ----------
    input_file_path : str
        Path of the input file.
    output_filename : str
        Name of the output ris file with extension.

    Returns
    -------
    None
    """
    # Both files are managed by ``with`` so they are closed even if reading or writing raises.
    # NOTE: the output is opened in append mode, so an already existing output file is extended, not replaced.
    with open(input_file_path, "r") as input_file, open(output_filename, "a") as output_file:
        # Only lines consisting of exactly "\n" are dropped; whitespace-only lines are kept.
        output_file.writelines(line for line in input_file if line != "\n")
def write_json_file_with_dict(output_file_path: str, input_dict: dict) -> None:
    """Serialize a dictionary to a json file at ``output_file_path``.

    Parameters
    ----------
    output_file_path : str
        Path of the output file; if only a name is provided the json is exported relative to the current
        working directory.
    input_dict : dict
        The python dictionary to save in json file format.

    Returns
    -------
    None
        Function doesn't return anything but writes a json file at output_file_path.
    """
    with open(output_file_path, "w") as json_target:
        json.dump(input_dict, fp=json_target)
def json_file_to_dict(json_file_path: str) -> dict:
    """Read the json file at the given path and convert its data to python objects.

    Parameters
    ----------
    json_file_path : str
        Path of the json file to be converted.

    Returns
    -------
    dict
        The data parsed from the json file by ``json.load``.
    """
    with open(json_file_path, 'r') as json_source:
        return json.load(json_source)
def get_text_from_pdf_pdftotext(pdf_file_path: str, pages: str = "all") -> str:
    """Extract the text from a pdf file via pdftotext. For more info, visit: https://pypi.org/project/pdftotext/

    Parameters
    ----------
    pdf_file_path : str
        Path of the pdf file.
    pages : str
        'all' to get the full text of the pdf, 'first' for the first page only. Any other value is used
        directly as an index into the pdf object.

    Returns
    -------
    str
        The required text from the pdf file. An empty string is returned for 'first' when the pdf object holds
        no pages (for example the empty string returned by get_pdf_object_from_pdf_path when pdftotext is not
        installed), consistent with what 'all' returns in that situation.
    """
    pdf_object = get_pdf_object_from_pdf_path(pdf_file_path)
    if pages == "all":
        # Concatenate the page texts without rebinding the ``pages`` parameter.
        return "".join(pdf_object)
    if pages == "first":
        # Guard against an empty pdf object instead of failing with IndexError.
        return pdf_object[0] if len(pdf_object) else ""
    return pdf_object[pages]
def get_pdf_object_from_pdf_path(pdf_file_path: str):
    """Load a pdf file with pdftotext into an object that can be looped over or indexed to get text per page.

    Parameters
    ----------
    pdf_file_path : str
        Path of the pdf file.

    Returns
    -------
    The ``pdftotext.PDF`` object built from the file, or an empty string (after printing installation
    instructions) when the pdftotext library cannot be imported.
    """
    try:
        import pdftotext
    except ImportError:
        installation_help = """This function requires pdftotext library to read pdfs.
step 1. install OS Dependencies:
These instructions assume you're using Python 3 on a recent OS.
- Debian, Ubuntu, and friends
sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
- Fedora, Red Hat, and friends
sudo yum install gcc-c++ pkgconfig poppler-cpp-devel python3-devel
- macOS
brew install pkg-config poppler python
- Windows (Install poppler through conda)
conda install -c conda-forge poppler
step 2. Install pdftotext
pip install pdftotext
for more info, please visit https://pypi.org/project/pdftotext/"""
        print(installation_help)
        return ""
    # The pdf is opened in binary mode and handed to pdftotext while the file is still open.
    with open(pdf_file_path, "rb") as pdf_stream:
        return pdftotext.PDF(pdf_stream)
def get_text_from_pdf_pymupdf(pdf_file_path: str, pages: str = 'all') -> str:
    """Extract the text from a pdf file via fitz (PyMuPDF). For more info, visit: https://pypi.org/project/PyMuPDF/

    Parameters
    ----------
    pages : str
        'all' to get the full text of the pdf and 'first' for the first page of the pdf. Any other value yields
        an empty string.
    pdf_file_path : str
        Path of the pdf file.

    Returns
    -------
    str
        The required text from the pdf file, or an empty string (after printing installation instructions) when
        PyMuPDF cannot be imported.
    """
    try:
        import fitz
    except ImportError:
        print("""This function requires pymupdf library to read pdfs.
Install pymupdf using:
python -m pip install --upgrade pip
python -m pip install --upgrade pymupdf
for more info, please visit https://pypi.org/project/PyMuPDF/""")
        return ""
    with fitz.open(pdf_file_path) as document:
        if pages == "first":
            # Only the first page is read; a document without pages gives an empty string.
            first_page = next(iter(document), None)
            return "" if first_page is None else "" + first_page.get_text()
        if pages == "all":
            return "".join(page.get_text() for page in document)
    return ""
def get_text_from_pdf(pdf_file_path: str, pages: str = 'all', pdf_reader: str = 'pdftotext') -> Union[str, bool]:
    """Get the text of a pdf file using either pdftotext or pymupdf.

    Parameters
    ----------
    pdf_reader : str
        Name of the python pdf reader package that converts the pdf to text: 'pdftotext' or 'pymupdf'.
    pdf_file_path : str
        Path of the pdf file.
    pages : str
        'all' to get the full text of the pdf and 'first' for the first page of the pdf.

    Returns
    -------
    str
        The required text from the pdf file. An empty string is returned when the selected reader raises any
        exception. For an unknown ``pdf_reader`` "Not Implemented" is printed and None is returned.
    """
    try:
        if pdf_reader == 'pdftotext':
            return get_text_from_pdf_pdftotext(pdf_file_path, pages)
        if pdf_reader == 'pymupdf':
            return get_text_from_pdf_pymupdf(pdf_file_path, pages)
        print("Not Implemented")
    except Exception:
        return ""
    return None
def get_text_from_multiple_pdf_reader(pdf_file_path: str, pages: str = 'all') -> Union[str, bool]:
    """Get the text of a pdf file using pdftotext, falling back to pymupdf when that yields nothing.

    Parameters
    ----------
    pdf_file_path : str
        Path of the pdf file.
    pages : str
        'all' to get the full text of the pdf and 'first' for the first page of the pdf.

    Returns
    -------
    str
        The required text from the pdf file; an empty string when both readers fail or produce no text.
    """
    # First attempt: pdftotext. Any failure is treated the same as "no text extracted".
    try:
        extracted_text = get_text_from_pdf_pdftotext(pdf_file_path, pages)
    except Exception:
        extracted_text = ""
    if extracted_text != "":
        return extracted_text
    # Fallback: pymupdf. If it fails too, the empty result of the first attempt is returned.
    try:
        return get_text_from_pdf_pymupdf(pdf_file_path, pages)
    except Exception:
        return extracted_text
class ASReview:
    """Export citations, given as a dataframe or a records list, to the csv file used to start an ASReview project."""

    def __init__(self, data: Union[List[dict], pd.DataFrame]):
        """Store the citations to export.

        Parameters
        ----------
        data :
            data could be List[dict] and pd.DataFrame
        """
        self.data = data

    def get_file(self, output_filename: str = "output.csv", index: bool = True):
        """Output the file needed to start a project in ASReview.

        An empty 'label_included' column is added to a copy of the data before it is written as csv; the stored
        data itself is not modified.

        Parameters
        ----------
        output_filename : str
            Name or path of the needed file.
        index : bool
            Whether the index column is included in the output file.

        Returns
        -------
        None

        Raises
        ------
        NotImplementedError
            When the stored data is neither a pd.DataFrame nor a list.
        """
        # isinstance also accepts subclasses of DataFrame and list, which an exact type comparison rejects.
        if isinstance(self.data, pd.DataFrame):
            dataframe = self.data.copy()
        elif isinstance(self.data, list):
            # The dataframe built here is already a fresh object, so no extra copy is needed.
            dataframe = records_list_to_dataframe(self.data)
        else:
            raise NotImplementedError(f"data type {type(self.data)} not Implemented, Use List[dict] and pd.DataFrame.")
        dataframe['label_included'] = ""
        dataframe_to_csv_file(dataframe, output_filename, index)
class Reader:
    """Contains functionality to read files.
    """

    def __init__(self, file_path: str):
        """Needs file path to read a file.

        Parameters
        ----------
        file_path : str
            path of the file.
        """
        self.file_path = file_path
        self.file_extension = os_utils.get_file_extension_from_path(self.file_path)

    def get_text(self, pages: str = 'all'):
        """Pick a reader from the file extension and output the content of the file.

        Parameters
        ----------
        pages : str
            Option to read 'first' or 'all' pages; only used for pdf files.

        Returns
        -------
        object
            Text for pdf files and for any extension not listed here, a pandas object for "csv" and for
            extensions starting with "x" (read with ``pandas.read_excel``), and the parsed json data for "json".
        """
        extension = self.file_extension
        if extension == "pdf":
            return get_text_from_multiple_pdf_reader(self.file_path, pages)
        if extension == "csv":
            return self.pandas_reader("read_csv")
        # startswith() instead of indexing [0]: an empty extension falls through to the plain text reader
        # rather than raising IndexError.
        if extension.startswith("x"):
            return self.pandas_reader("read_excel")
        if extension == "json":
            return json_file_to_dict(self.file_path)
        return load_text_file(self.file_path)

    def pdf_pdftotext_reader(self, pages: str = 'all'):
        """Extract the text from pdf file via pdftotext. For more info, visit: https://pypi.org/project/pdftotext/

        Parameters
        ----------
        pages : str
            This could be 'all' to get full text of pdf and 'first' for first page of pdf.

        Returns
        -------
        str
            The required text from the pdf file, or an empty string when extraction raises any exception.
        """
        try:
            return get_text_from_pdf_pdftotext(self.file_path, pages)
        except Exception:
            return ""

    def pdf_pymupdf_reader(self, pages: str = 'all'):
        """Extract the text from pdf file via fitz(PyMuPDF). For more info, visit: https://pypi.org/project/PyMuPDF/

        Parameters
        ----------
        pages : str
            This could be 'all' to get full text of pdf and 'first' for first page of pdf.

        Returns
        -------
        str
            The required text from the pdf file, or an empty string when extraction raises any exception.
        """
        try:
            return get_text_from_pdf_pymupdf(self.file_path, pages)
        except Exception:
            return ""

    def pandas_reader(self, input_file_type):
        """Read file using pandas IO https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html

        Parameters
        ----------
        input_file_type : str
            Name of the pandas IO function to call, like read_csv, read_excel etc.

        Returns
        -------
        object
            Whatever ``pandas.<input_file_type>(self.file_path)`` returns (a DataFrame for read_csv).
        """
        pandas_io_function = getattr(pd, input_file_type)
        return pandas_io_function(self.file_path)