"""Module: analysis
This module contains code for generating info, diagrams and tables. It can be used to generate systematic review flow
and citation information.
"""
from typing import List, Union, Any
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from systematic_review import os_utils, converter, citation, string_manipulation, validation
def creating_sample_review_file(selected_citation_df):
    """Return the selected citations extended with blank literature-review columns.

    Parameters
    ----------
    selected_citation_df : pandas.DataFrame object
        Records chosen for manual literature review (the result of the last step of systematic-reviewpy).

    Returns
    -------
    pandas.DataFrame object
        The input records concatenated with an empty frame that carries the review columns, so every
        review column not already present is added without values.
    """
    # Columns a reviewer fills in by hand while reading each article.
    review_columns = [
        'Main Topic', 'Sub Topic', 'source', 'Aim of the Study(objectives)', 'data sources',
        'Data period', 'Input Variables', 'methodology', 'Findings', 'Research Gap/ Limitations',
        'Results / Conclusions', 'place_published', 'Notes, Special Considerations', 'Email ID',
    ]
    blank_review_frame = pd.DataFrame(columns=review_columns)
    return pd.concat([selected_citation_df, blank_review_frame])
def analysis_of_multiple_ris_citations_files(citations_files_parent_folder_path: str) -> dict:
    """Count the records of every ``.ris`` file found under a folder.

    Parameters
    ----------
    citations_files_parent_folder_path : str
        Parent folder that is searched for citation files.

    Returns
    -------
    dict
        Maps each ris file name (as returned by ``os_utils.get_filename_from_path``) to its number of
        records, plus a ``"total"`` key holding the sum over all ris files.
    """
    record_counts = {"total": 0}
    every_path = os_utils.extract_files_path_from_directories_or_subdirectories(
        citations_files_parent_folder_path)
    for file_path in every_path:
        # Only ris exports are counted; any other file is ignored.
        if not file_path.endswith(".ris"):
            continue
        records_in_file = len(converter.ris_file_to_records_list(file_path))
        record_counts[os_utils.get_filename_from_path(file_path)] = records_in_file
        record_counts["total"] += records_in_file
    return record_counts
def vertical_dict_view(dictionary: dict) -> str:
    """Render a dict as text with one ``key : value`` pair per line.

    Parameters
    ----------
    dictionary : dict
        Keys and values to be listed vertically.

    Returns
    -------
    str
        ``"key1 : value1\\nkey2 : value2\\n..."``; every pair, including the last, ends with a newline.
        An empty dict gives an empty string.
    """
    return "".join(f"{name} : {item}\n" for name, item in dictionary.items())
def duplicate_count(dataframe: pd.DataFrame) -> int:
    """Return how many records are dropped when duplicates are removed.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Citation records to check for duplicates.

    Returns
    -------
    int
        Length of ``dataframe`` minus the length of the result of
        ``citation.drop_duplicates_citations(dataframe)``.
    """
    deduplicated_df = citation.drop_duplicates_citations(dataframe)
    return len(dataframe) - len(deduplicated_df)
def missed_article_count(filter_sorted_citations_df: pd.DataFrame, downloaded_articles_path: str,
                         title_column_name: str = "cleaned_title"):
    """Count the articles that are listed for download but absent from the downloaded files.

    Parameters
    ----------
    filter_sorted_citations_df : pd.DataFrame
        Records of the selected articles, including their titles.
    downloaded_articles_path : str
        Parent folder of all the downloaded article files.
    title_column_name : str
        Column of ``filter_sorted_citations_df`` holding the article title.

    Returns
    -------
    int
        Length of the first element returned by
        ``validation.finding_missed_articles_from_downloading``.
    """
    expected_titles = list(filter_sorted_citations_df[title_column_name])

    # The validator returns three lists; only the validated one is used here.
    validation_outcome = validation.validating_pdfs_using_multiple_pdf_reader(downloaded_articles_path)
    validated_details, _invalidated, _manual = validation_outcome
    validated_paths = validation.getting_article_paths_from_validation_detail(validated_details)

    # File names are passed through the project's string preprocessing before comparison with titles.
    downloaded_titles = []
    for article_path in validated_paths:
        file_name = os_utils.get_filename_from_path(article_path)
        downloaded_titles.append(string_manipulation.preprocess_string(file_name))

    missed = validation.finding_missed_articles_from_downloading(downloaded_titles, expected_titles)
    return len(missed[0])
def text_padding_for_visualise(text: str, front_padding_space_multiple: int = 4,
                               top_bottom_line_padding_multiple: int = 1):
    """Surround text with blank space on all four sides and report its padded size.

    Parameters
    ----------
    text : str
        Text to pad; may contain several lines separated by ``"\\n"``.
    front_padding_space_multiple : int
        Number of spaces added to the left and to the right of every line.
    top_bottom_line_padding_multiple : int
        Number of padding lines counted above and below the text.

    Returns
    -------
    tuple
        str - the padded text.
        int - height: number of text lines plus twice ``top_bottom_line_padding_multiple``.
        int - width: length of the longest text line plus twice ``front_padding_space_multiple``.
    """
    side_padding = " " * front_padding_space_multiple
    lines = text.split("\n")

    padded_body = "".join(f"{side_padding}{line}{side_padding}\n" for line in lines)
    # Every line already ends with a newline, so one fewer newline is appended at the bottom
    # than is placed at the top.
    output_text = (
        "\n" * top_bottom_line_padding_multiple
        + padded_body
        + "\n" * (top_bottom_line_padding_multiple - 1)
    )

    longest_line = max(len(line) for line in lines)
    height = top_bottom_line_padding_multiple * 2 + len(lines)
    width = front_padding_space_multiple * 2 + longest_line
    return output_text, height, width
def custom_box(**kwargs) -> dict:
    """Build the option dict used for matplotlib text drawn inside a box.

    Parameters
    ----------
    kwargs : dict
        Options that override or extend the defaults, key by key.

    Returns
    -------
    dict
        Default options (white square box, centred text, midnightblue colour) updated with ``kwargs``.
    """
    # For extra padding use "boxstyle": "square,pad=<amount>".
    default_options = {
        "bbox": {"boxstyle": "square", "facecolor": "white"},
        "horizontalalignment": "center",
        "verticalalignment": "center",
        "color": "midnightblue",
    }
    return {**default_options, **kwargs}
class TextInBox:
    """Text placed in a box on a matplotlib axes, with the anchor points of its four sides.
    """

    def __init__(self, figure_axes, x_coordinate, y_coordinate, text=""):
        """Store the box centre, pad the text and compute the middle point of every side.

        Parameters
        ----------
        figure_axes : matplotlib.pyplot.axes
            Axes on which the box is drawn.
        x_coordinate : float
            x coordinate of the box centre.
        y_coordinate : float
            y coordinate of the box centre.
        text : str
            Text written inside the box.
        """
        self.figure_axes = figure_axes
        self.x_coordinate = x_coordinate
        self.y_coordinate = y_coordinate
        # Size of one character / one text line in axes units.
        # NOTE(review): presumably tuned for the default font of the diagram - confirm.
        self.width_of_one_char = 0.067
        self.width_of_one_line = 0.165

        padded_text, line_count, char_count = text_padding_for_visualise(text)
        self.text = padded_text

        half_width = (char_count / 2) * self.width_of_one_char
        half_height = (line_count / 2) * self.width_of_one_line
        # Middle points of the four sides, as (x, y) tuples.
        self.left = (self.x_coordinate - half_width, self.y_coordinate)
        self.right = (self.x_coordinate + half_width, self.y_coordinate)
        self.top = (self.x_coordinate, self.y_coordinate + half_height)
        self.bottom = (self.x_coordinate, self.y_coordinate - half_height)

    def add_box(self, **kwargs: Union[dict, str, Any]):
        """Draw the boxed text on the axes.

        Parameters
        ----------
        kwargs : Union[dict, str, Any]
            Options forwarded to ``custom_box`` to override the default box options.

        Returns
        -------

        """
        box_options = custom_box(**kwargs)
        self.figure_axes.text(self.x_coordinate, self.y_coordinate, self.text, box_options)
class Annotate:
    """Arrow between two points of a matplotlib axes.
    """

    def __init__(self, figure_axes, start_coordinate, end_coordinate, arrow_style="<|-"):
        """Remember the axes, both end points and the arrow style.

        Parameters
        ----------
        figure_axes : matplotlib.pyplot.axes
            Axes on which the arrow is drawn.
        start_coordinate : tuple
            (x, y) of the arrow handle.
        end_coordinate : tuple
            (x, y) of the arrow head target.
        arrow_style : str
            matplotlib arrow style symbol.
        """
        self.figure_axes = figure_axes
        self.start_coordinate = start_coordinate
        self.end_coordinate = end_coordinate
        self.arrowstyle = arrow_style

    def add_arrow(self, text=""):
        """Draw the arrow on the axes.

        Parameters
        ----------
        text : str
            Text placed on the arrow.

        Returns
        -------

        """
        arrow_properties = {"arrowstyle": self.arrowstyle}
        # The start point is passed as ``xy`` and the end point as ``xytext`` (positional order of
        # ``Axes.annotate``).
        self.figure_axes.annotate(text, self.start_coordinate, self.end_coordinate,
                                  arrowprops=arrow_properties)
class SystematicReviewInfo:
    """This analyses the whole systematic review process and takes all produced files to generate text and a
    flow diagram.

    Every count that cannot be computed from the given inputs is stored as an empty string ``""`` so that it
    renders as a blank in the generated text.
    """
    download_flag_column_name = 'downloaded'
    file_validated_flag_name = "yes"

    def __init__(self, citations_files_parent_folder_path: str = None, filter_sorted_citations_df: pd.DataFrame = None,
                 validated_research_papers_df: pd.DataFrame = None, selected_research_papers_df: pd.DataFrame = None):
        """Compute the counts of the systematic review flow from whichever inputs are given.

        Parameters
        ----------
        citations_files_parent_folder_path : str
            this is the path of parent folder of where citations files exists.
        filter_sorted_citations_df : pd.DataFrame
            This is screened dataframe containing records for downloading full text.
        validated_research_papers_df : pd.DataFrame
            This contains validation of downloaded research articles.
        selected_research_papers_df : pd.DataFrame
            This dataframe contains records for manual literature review.
        """
        if citations_files_parent_folder_path is not None:
            self.citations_files_parent_folder_path = citations_files_parent_folder_path
            self.sources = analysis_of_multiple_ris_citations_files(citations_files_parent_folder_path)
            self.duplicates = duplicate_count(
                converter.load_multiple_ris_citations_files_to_dataframe(citations_files_parent_folder_path))
            self.screened = int(self.sources["total"]) - int(self.duplicates)
        else:
            # Without citation files none of the three values can be derived; they stay blank instead of
            # being computed from the "" placeholders (which used to raise TypeError).
            self.citations_files_parent_folder_path = ""
            self.sources = ""
            self.duplicates = ""
            self.screened = ""

        self.for_retrieval = len(filter_sorted_citations_df) if filter_sorted_citations_df is not None else ""

        # The difference only exists when both operands are real counts.
        if isinstance(self.screened, int) and isinstance(self.for_retrieval, int):
            self.screened_out = self.screened - self.for_retrieval
        else:
            self.screened_out = ""

        if (validated_research_papers_df is not None) and \
                (self.download_flag_column_name in validated_research_papers_df.columns):
            is_validated = validated_research_papers_df[self.download_flag_column_name] == \
                self.file_validated_flag_name
            self.not_retrieved = len(validated_research_papers_df) - len(
                validated_research_papers_df.loc[is_validated])
        else:
            self.not_retrieved = ""

        self.eligible = len(selected_research_papers_df) if selected_research_papers_df is not None else ""
        # Results of the manual review steps; left blank for the caller to fill in.
        self.manually_excluded = ""
        self.manually_excluded_reasons = ""
        self.included = ""

    def get_text_list(self) -> List[str]:
        """This produces the list of all analysis done in this class.

        Returns
        -------
        List[str]
            Nine texts: index 0-4 are the main flow steps (identified, screened, sought for retrieval,
            assessed, included) and index 5-8 are the removals that belong to steps 0-3.
        """
        # ``sources`` is a dict only when citation files were analysed; otherwise it is the "" placeholder.
        sources_text = vertical_dict_view(self.sources) if isinstance(self.sources, dict) else self.sources
        text_list = [f"Records identified from -\n{sources_text}",
                     f"Records screened\n(n = {self.screened})",
                     f"Reports sought for retrieval\n(n = {self.for_retrieval})",
                     f"Reports assessed for eligibility\n(n = {self.eligible})",
                     f"Total studies included in review\n(n = {self.included})",
                     f"Records removed before screening -\nDuplicate records removed\n (n = {self.duplicates})",
                     f"Records screened out\n(n = {self.screened_out})",
                     f"Reports not retrieved -\n{self.download_flag_column_name}\n(n = {self.not_retrieved})",
                     f"Reports excluded\n{self.manually_excluded}\n{self.manually_excluded_reasons}"]
        return text_list

    def info(self):
        """This prints the systematic review texts, each flow step followed by its removal text.

        Returns
        -------

        """
        temp_text = self.get_text_list()
        order = [0, 5, 1, 6, 2, 7, 3, 8, 4]
        for index in order:
            print(temp_text[index], "\n")

    def systematic_review_diagram(self, fig_width=10, fig_height=10, diagram_fname: str = None, color: bool = True,
                                  color_info: bool = True,
                                  auto_fig_size: bool = True,
                                  hide_border: bool = True, **kwargs):
        """This outputs the systematic review diagram resembling PRISMA guidelines.

        The left column holds the five flow steps (texts 0-4), the right column holds the removals of the
        first four steps (texts 5-8). Vertical arrows connect consecutive steps, horizontal arrows connect
        each step with its removal box.

        Parameters
        ----------
        fig_width : float
            This is width of figure in inches. Ignored when ``auto_fig_size`` is True.
        fig_height : float
            This is height of figure in inches. Ignored when ``auto_fig_size`` is True.
        diagram_fname : str
            filename or path of diagram image to be saved.
        color : bool
            This is color inside of diagram boxes. turn this off by putting False.
        color_info : bool
            This show meaning of color in diagram.
        auto_fig_size : bool
            this sets the figure size automatically based on given data.
        hide_border : bool
            border is line outside of diagram
        kwargs : dict
            kwargs are also given to ``matplotlib.pyplot.savefig(**kwargs)``

        Returns
        -------

        """
        text_list = self.get_text_list()
        width_of_one_char = 0.067
        width_of_one_line = 0.165
        top_spaces = width_of_one_line * 2
        top_outer_spaces = width_of_one_line * 4
        left_spaces = width_of_one_char * 10
        left_outer_spaces = width_of_one_char * 15

        # Padded size (in lines / characters) of every text, computed once.
        padded_sizes = [text_padding_for_visualise(text) for text in text_list]
        line_counts = [size[1] for size in padded_sizes]
        char_counts = [size[2] for size in padded_sizes]
        # (left box, right box) sharing one row; text 4 has a row of its own.
        row_pairs = ((0, 5), (1, 6), (2, 7), (3, 8))

        height = 2 * top_outer_spaces + 4 * top_spaces
        for left_index, right_index in row_pairs:
            height += width_of_one_line * max(line_counts[left_index], line_counts[right_index])
        height += width_of_one_line * line_counts[4]

        # The left column contains texts 0-4, the right column texts 5-8.
        max_left_width = width_of_one_char * max(char_counts[:5])
        max_right_width = width_of_one_char * max(char_counts[5:])
        width = 2 * left_outer_spaces + left_spaces + (max_left_width + max_right_width)

        if auto_fig_size:
            fig_width = width
            fig_height = height

        fig = plt.figure(figsize=(fig_width, fig_height))
        ax = fig.add_axes((0, 0, 1, 1))
        # Axis limits equal the figure size, so one axis unit corresponds to one inch.
        ax.set_xlim(0, fig_width)
        ax.set_ylim(0, fig_height)
        ax.tick_params(bottom=False, top=False,
                       left=False, right=False)
        ax.tick_params(labelbottom=False, labeltop=False,
                       labelleft=False, labelright=False)

        x_position_left = left_outer_spaces + (max_left_width / 2)
        x_position_right = left_outer_spaces + max_left_width + left_spaces + (max_right_width / 2)

        # Rows are stacked from the top; each row is placed below the bottom of the previous left box.
        left_boxes = []
        right_boxes = []
        previous_bottom = None
        for left_index, right_index in row_pairs:
            half_row_height = width_of_one_line * max(line_counts[left_index] / 2, line_counts[right_index] / 2)
            if previous_bottom is None:
                row_y = fig_height - top_outer_spaces - half_row_height
            else:
                row_y = previous_bottom - top_spaces - half_row_height
            left_box = TextInBox(ax, x_position_left, row_y, text_list[left_index])
            left_boxes.append(left_box)
            right_boxes.append(TextInBox(ax, x_position_right, row_y, text_list[right_index]))
            previous_bottom = left_box.bottom[1]
        final_box = TextInBox(ax, x_position_left,
                              previous_bottom - top_spaces - (width_of_one_line * line_counts[4] / 2),
                              text_list[4])
        # Same indexing as ``text_list``: 0-4 left column, 5-8 right column.
        all_boxes = left_boxes + [final_box] + right_boxes

        if color:
            for box_index, box in enumerate(all_boxes):
                # Box 4 is drawn red, every other box green; the legend below names red "Manual step"
                # and green "Automated step".
                if box_index == 4:
                    box.add_box(
                        bbox={"boxstyle": "square", "facecolor": 'red', "alpha": 0.3, "edgecolor": "red"},
                        color="black")
                else:
                    box.add_box(
                        bbox={"boxstyle": "square", "facecolor": 'lightgreen', "alpha": 0.3, "edgecolor": "green"},
                        color="black")
        else:
            for box in all_boxes:
                box.add_box()

        # vertical arrows 0-1 to 3-4
        all_arrows = [Annotate(ax, all_boxes[index].bottom, all_boxes[index + 1].top) for index in range(4)]
        # Horizontal arrows 0-5 to 3-8
        all_arrows += [Annotate(ax, all_boxes[left_index].right, all_boxes[right_index].left)
                       for left_index, right_index in row_pairs]
        for arrow in all_arrows:
            arrow.add_arrow()

        if color_info:
            import matplotlib.patches as patches

            # The legend sits below the last right-hand box, one entry a line above and one a line below
            # the reference height.
            legend_base_y = all_boxes[8].bottom[1]
            half_final_height = width_of_one_line * (line_counts[4] / 2)
            automated_y = legend_base_y + width_of_one_line - top_spaces - half_final_height
            manual_y = legend_base_y - width_of_one_line - top_spaces - half_final_height
            marker_x = x_position_right - (width_of_one_char * 3)

            ax.text(x_position_right, automated_y, "Automated step")
            ax.text(x_position_right, manual_y, "Manual step")
            green_rect = patches.Rectangle((marker_x, automated_y),
                                           .1, .1, linewidth=1, facecolor='green', alpha=0.6)
            red_rect = patches.Rectangle((marker_x, manual_y),
                                         .1, .1, linewidth=1, facecolor='red', alpha=0.5)
            ax.add_patch(green_rect)
            ax.add_patch(red_rect)

        # makes border invisible
        if hide_border:
            ax.axis('off')
        if diagram_fname:
            # Options must be unpacked: savefig takes them as keyword arguments, not as one positional dict.
            plt.savefig(diagram_fname, **kwargs)
        plt.show()
def pandas_countplot_with_pandas_dataframe_column(dataframe, column_name, top_result, plot_kind: str = "bar",
                                                  diagram_fname: str = None, **kwargs):
    """generate pandas count chart using dataframe column.

    Parameters
    ----------
    dataframe : pd.DataFrame
        dataframe which contains column whose value counts to be shown.
    column_name : str
        Name of pandas column elements are supposed to be counted.
    top_result : int
        This limits the number of column unique elements to be shown; ``None`` shows all of them.
    plot_kind : str
        pandas plot option of kind of chart needed. defaults to 'bar' in this implementation
    diagram_fname : str
        filename or path of diagram image to be saved.
    kwargs : dict
        kwargs are also given to ``matplotlib.pyplot.savefig(**kwargs)``

    Returns
    -------

    """
    counts = dataframe[column_name].value_counts()
    # ``iloc`` keeps the limit positional whatever the index type of the counted values is.
    counts.iloc[:top_result].plot(kind=plot_kind)
    if diagram_fname:
        # Options must be unpacked: savefig takes them as keyword arguments, not as one positional dict.
        plt.savefig(diagram_fname, **kwargs)
    plt.show()
def seaborn_countplot_with_pandas_dataframe_column(dataframe, column_name, theme_style="darkgrid",
                                                   xaxis_label_rotation=90, top_result=None,
                                                   diagram_fname: str = None, **kwargs):
    """generate seaborn count bar chart using dataframe column.

    Parameters
    ----------
    dataframe : pd.DataFrame
        dataframe which contains column whose value counts to be shown.
    column_name : str
        Name of pandas column elements are supposed to be counted.
    theme_style : str
        name of the bar chart theme
    xaxis_label_rotation : float
        rotate the column elements shown on x axis or horizontally.
    top_result : int
        This limits the number of column unique elements to be shown; ``None`` shows all of them.
    diagram_fname : str
        filename or path of diagram image to be saved.
    kwargs : dict
        kwargs are also given to ``matplotlib.pyplot.savefig(**kwargs)``

    Returns
    -------
    None
        The chart is shown (and optionally saved); nothing is returned.
    """
    # The theme has to be active before the axes are created, otherwise it only affects later plots.
    sns.set_theme(style=theme_style)
    most_frequent = dataframe.value_counts(column_name).iloc[:top_result].index
    ax = sns.countplot(x=column_name, data=dataframe, order=most_frequent)
    plt.xticks(rotation=xaxis_label_rotation)
    # Label every bar container; looping also avoids an IndexError when nothing was drawn.
    for container in ax.containers:
        ax.bar_label(container)
    if diagram_fname:
        # Options must be unpacked: savefig takes them as keyword arguments, not as one positional dict.
        plt.savefig(diagram_fname, **kwargs)
    plt.show()
class CitationAnalysis:
    """Analyses of the columns of a pandas dataframe that holds citation details.
    """

    def __init__(self, dataframe):
        """Keep the citation dataframe that all analyses read from.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Citation records. The analyses look up columns by name, so pass a different column name to a
            method when the dataframe does not use the default one.
        """
        self.dataframe = dataframe
def publication_year_info(self, column_name: str = "year"):
    """Count the articles per publication year.

    Parameters
    ----------
    column_name : str
        column name of publication year detail in citation dataframe

    Returns
    -------
    object
        Result of ``converter.dataframe_column_counts`` for the year column.
    """
    year_counts = converter.dataframe_column_counts(self.dataframe, column_name)
    return year_counts
def publication_year_diagram(self, column_name: str = "year",
                             top_result=None, method: str = "seaborn", theme_style="darkgrid",
                             xaxis_label_rotation=90, pandas_bar_kind: str = "bar", diagram_fname: str = None,
                             **kwargs):
    """generates chart showing how many articles are published each year.

    Parameters
    ----------
    column_name : str
        column name of publication year detail in citation dataframe
    top_result : int
        This limits the number of column unique elements to be shown
    method : str
        provide option to plot chart using either 'seaborn' or 'pandas' (case-insensitive). Any other
        value only prints a hint and draws nothing.
    theme_style : str
        name of the bar chart theme (seaborn only)
    xaxis_label_rotation : float
        rotate the column elements shown on x axis or horizontally (seaborn only).
    pandas_bar_kind : str
        pandas plot option of kind of chart needed. defaults to 'bar' in this implementation
    diagram_fname : str
        filename or path of diagram image to be saved.
    kwargs : dict
        kwargs are forwarded to the plotting helper, which hands them to ``matplotlib.pyplot.savefig``

    Returns
    -------

    """
    chosen_method = method.lower()
    if chosen_method == "seaborn":
        seaborn_countplot_with_pandas_dataframe_column(self.dataframe, column_name, theme_style,
                                                       xaxis_label_rotation, top_result, diagram_fname, **kwargs)
    elif chosen_method == "pandas":
        pandas_countplot_with_pandas_dataframe_column(self.dataframe, column_name, top_result, pandas_bar_kind,
                                                      diagram_fname, **kwargs)
    else:
        # The hint names the actual parameter of this method.
        print("Please provide method value as 'seaborn' or 'pandas'.")
def authors_analysis(self, authors_column_name="authors"):
    """generates the details based on pandas dataframe column of article authors. example- Number of authors,
    Articles with single authors, Articles per authors, Authors per articles

    Parameters
    ----------
    authors_column_name : str
        Name of column containing authors details. Each entry is expected to be a collection of author
        names; a plain string is treated as one author and a missing value (None / NaN) as no authors.

    Returns
    -------
    tuple
        contains Number of authors, Articles with single authors, Articles per authors, Authors per articles.
        A ratio whose denominator is zero (no articles or no authors) is reported as 0.
    """
    number_of_articles = len(self.dataframe)
    unique_author_names = set()
    articles_with_single_authors = 0
    for authors_entry in self.dataframe[authors_column_name]:
        # Missing author information: nothing to count for this article.
        if authors_entry is None or isinstance(authors_entry, float):
            continue
        # A bare string is one author name, not a sequence of one-character names.
        if isinstance(authors_entry, str):
            authors_entry = [authors_entry]
        if len(authors_entry) == 1:
            articles_with_single_authors += 1
        unique_author_names.update(authors_entry)
    number_of_authors = len(unique_author_names)
    # Guard both ratios so an empty dataframe does not raise ZeroDivisionError.
    articles_per_authors = number_of_articles / number_of_authors if number_of_authors else 0
    authors_per_articles = number_of_authors / number_of_articles if number_of_articles else 0
    return number_of_authors, articles_with_single_authors, articles_per_authors, authors_per_articles
def authors_info(self):
    """Print the four values of ``authors_analysis``, one labelled line each.

    Returns
    -------

    """
    labels = ("Number of authors", "Articles with single authors", "Articles per authors",
              "Authors per articles")
    for label, value in zip(labels, self.authors_analysis()):
        print(f"{label} = {value}")
def publication_place_info(self, column_name: str = "place_published"):
    """Count the articles per publication place.

    Parameters
    ----------
    column_name : str
        column name of publication place detail in citation dataframe

    Returns
    -------
    object
        Result of ``converter.dataframe_column_counts`` for the publication place column.
    """
    place_counts = converter.dataframe_column_counts(self.dataframe, column_name)
    return place_counts
def publication_place_diagram(self, column_name: str = "place_published",
                              top_result=None, method: str = "seaborn", theme_style="darkgrid",
                              xaxis_label_rotation=90, pandas_bar_kind: str = "bar", diagram_fname: str = None,
                              **kwargs):
    """generates chart showing how many articles are published from different places or countries.

    Parameters
    ----------
    column_name : str
        column name of publication place detail in citation dataframe
    top_result : int
        This limits the number of column unique elements to be shown
    method : str
        provide option to plot chart using either 'seaborn' or 'pandas' (case-insensitive). Any other
        value only prints a hint and draws nothing.
    theme_style : str
        name of the bar chart theme (seaborn only)
    xaxis_label_rotation : float
        rotate the column elements shown on x axis or horizontally (seaborn only).
    pandas_bar_kind : str
        pandas plot option of kind of chart needed. defaults to 'bar' in this implementation
    diagram_fname : str
        filename or path of diagram image to be saved.
    kwargs : dict
        kwargs are forwarded to the plotting helper, which hands them to ``matplotlib.pyplot.savefig``

    Returns
    -------

    """
    chosen_method = method.lower()
    if chosen_method == "seaborn":
        seaborn_countplot_with_pandas_dataframe_column(self.dataframe, column_name, theme_style,
                                                       xaxis_label_rotation, top_result, diagram_fname, **kwargs)
    elif chosen_method == "pandas":
        pandas_countplot_with_pandas_dataframe_column(self.dataframe, column_name, top_result, pandas_bar_kind,
                                                      diagram_fname, **kwargs)
    else:
        # The hint names the actual parameter of this method.
        print("Please provide method value as 'seaborn' or 'pandas'.")
def publisher_info(self, column_name: str = "publisher"):
    """Count the articles per publisher.

    Parameters
    ----------
    column_name : str
        column name of publisher detail in citation dataframe.

    Returns
    -------
    object
        Result of ``converter.dataframe_column_counts`` for the publisher column.
    """
    publisher_counts = converter.dataframe_column_counts(self.dataframe, column_name)
    return publisher_counts
def publisher_diagram(self, column_name: str = "publisher",
                      top_result=None, method: str = "seaborn", theme_style="darkgrid",
                      xaxis_label_rotation=90, pandas_bar_kind: str = "bar", diagram_fname: str = None, **kwargs):
    """generates chart showing how many articles are published by different publishers.

    Parameters
    ----------
    column_name : str
        column name of publisher detail in citation dataframe
    top_result : int
        This limits the number of column unique elements to be shown
    method : str
        provide option to plot chart using either 'seaborn' or 'pandas' (case-insensitive). Any other
        value only prints a hint and draws nothing.
    theme_style : str
        name of the bar chart theme (seaborn only)
    xaxis_label_rotation : float
        rotate the column elements shown on x axis or horizontally (seaborn only).
    pandas_bar_kind : str
        pandas plot option of kind of chart needed. defaults to 'bar' in this implementation
    diagram_fname : str
        filename or path of diagram image to be saved.
    kwargs : dict
        kwargs are forwarded to the plotting helper, which hands them to ``matplotlib.pyplot.savefig``

    Returns
    -------

    """
    chosen_method = method.lower()
    if chosen_method == "seaborn":
        seaborn_countplot_with_pandas_dataframe_column(self.dataframe, column_name, theme_style,
                                                       xaxis_label_rotation, top_result, diagram_fname, **kwargs)
    elif chosen_method == "pandas":
        pandas_countplot_with_pandas_dataframe_column(self.dataframe, column_name, top_result, pandas_bar_kind,
                                                      diagram_fname, **kwargs)
    else:
        # The hint names the actual parameter of this method.
        print("Please provide method value as 'seaborn' or 'pandas'.")
def keywords_info(self, column_name: str = "keywords"):
    """Count how often each keyword occurs.

    Parameters
    ----------
    column_name : str
        column name of the keyword detail in the frame returned by ``self.extract_keywords()``

    Returns
    -------
    object
        Result of ``converter.dataframe_column_counts`` applied to the output of
        ``self.extract_keywords()``.
    """
    keywords_frame = self.extract_keywords()
    return converter.dataframe_column_counts(keywords_frame, column_name)
def keyword_diagram(self, column_name: str = "keywords",
                    top_result=None, method: str = "seaborn", theme_style="darkgrid",
                    xaxis_label_rotation=90, pandas_bar_kind: str = "bar", diagram_fname: str = None, **kwargs):
    """generates chart showing the value counts of the keyword column of ``self.extract_keywords()``.

    Parameters
    ----------
    column_name : str
        column name of the keyword detail in the frame returned by ``self.extract_keywords()``
    top_result : int
        This limits the number of column unique elements to be shown
    method : str
        provide option to plot chart using either 'seaborn' or 'pandas' (case-insensitive). Any other
        value only prints a hint and draws nothing.
    theme_style : str
        name of the bar chart theme (seaborn only)
    xaxis_label_rotation : float
        rotate the column elements shown on x axis or horizontally (seaborn only).
    pandas_bar_kind : str
        pandas plot option of kind of chart needed. defaults to 'bar' in this implementation
    diagram_fname : str
        filename or path of diagram image to be saved.
    kwargs : dict
        kwargs are forwarded to the plotting helper, which hands them to ``matplotlib.pyplot.savefig``

    Returns
    -------

    """
    chosen_method = method.lower()
    if chosen_method == "seaborn":
        seaborn_countplot_with_pandas_dataframe_column(self.extract_keywords(), column_name, theme_style,
                                                       xaxis_label_rotation, top_result, diagram_fname, **kwargs)
    elif chosen_method == "pandas":
        pandas_countplot_with_pandas_dataframe_column(self.extract_keywords(), column_name, top_result,
                                                      pandas_bar_kind, diagram_fname, **kwargs)
    else:
        # The hint names the actual parameter of this method.
        print("Please provide method value as 'seaborn' or 'pandas'.")