"""Module: validation
This module contains functions for validating that the downloaded articles are the same as the ones we require. It also
contains functions to get an article's source name and to create lists of missed or duplicate articles.
"""
from difflib import SequenceMatcher
import pandas as pd
from typing import List, Union, Dict, Any
from systematic_review import string_manipulation
from systematic_review import converter
from systematic_review import os_utils
def get_dataframe_column_as_list(dataframe: pd.DataFrame, column_name: str = 'primary_title') -> list:
"""Get a pandas dataframe column's values as a list.
Parameters
----------
dataframe : pd.DataFrame
This is the dataframe which contains the column whose values we want as a list.
column_name : str
This is the name of the column.
Returns
-------
list
This is the list containing the values of one dataframe column.
"""
column_values_list = dataframe[column_name].to_list()
return column_values_list
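# A minimal usage sketch of the helper above; the column name and values here are hypothetical.
if __name__ == "__main__":
    _titles_df = pd.DataFrame({'primary_title': ["Article A", "Article B"]})
    print(get_dataframe_column_as_list(_titles_df, 'primary_title'))  # ['Article A', 'Article B']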
def similarity_sequence_matcher(string_a: str, string_b: str) -> float:
"""Shows the similarity between two strings as a ratio, e.g. 0.9836065573770492, which means 98.36%.
Parameters
----------
string_a : str
This is the first string.
string_b : str
This is the second string.
Returns
-------
float
This is the result of SequenceMatcher, e.g. 0.9836065573770492, which means 98.36%.
"""
return SequenceMatcher(None, string_a, string_b).ratio()
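# Illustrative sketch: SequenceMatcher returns 1.0 for identical strings and a lower ratio as they diverge.
# The titles below are made up.
if __name__ == "__main__":
    print(similarity_sequence_matcher("fruit fly optimization", "fruit fly optimization"))  # 1.0
    print(similarity_sequence_matcher("fruit fly optimization", "fruit fly optimisation"))  # roughly 0.95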
def calculate_percentage(value: float, total: float) -> float:
"""Calculate the percentage of value in total.
Parameters
----------
value : float
This is the input number, normally smaller than total.
total : float
This is the larger number of which we want to know the percentage.
Returns
-------
float
This is the calculated percentage, e.g. 98.36065573770492, which means 98.36%.
"""
percentage = (value / total) * 100
return percentage
def amount_by_percentage(number: float, percentage: float) -> float:
"""Get the amount equal to a percentage of a number. Example: 5% (percentage) of 10 (number) is 0.5 (result).
Parameters
----------
number : float
This is the input number of which we want some percentage.
percentage : float
This is the mathematical percentage, on a 0-100 scale.
Returns
-------
float
This is the resultant number.
"""
return number * percentage / 100
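# Quick numeric sketch of the two percentage helpers above; the values are arbitrary.
if __name__ == "__main__":
    print(calculate_percentage(60, 61))  # 98.36065573770492, i.e. roughly 98.36%
    print(amount_by_percentage(10, 5))   # 0.5, i.e. 5% of 10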
def add_dict_element_with_count(dictionary: dict, key: str) -> dict:
"""Increments the value for the given key in the dictionary, or initialises the key with value 1. Works like a
collections.defaultdict that starts new keys at 1.
Parameters
----------
dictionary : dict
This is the dictionary to which we want to add the element.
key : str
This is the key of the dictionary {key: value}.
Returns
-------
dict
This is the edited dict with the new element counts.
"""
if key in dictionary.keys():
dictionary[key] += 1
else:
dictionary[key] = 1
return dictionary
def dict_from_list_with_element_count(input_list: list) -> dict:
"""Put the input list elements into a dictionary with their counts.
Parameters
----------
input_list : list
This is the list of elements, possibly with duplicates.
Returns
-------
dict
This is a dictionary with the list elements as keys and each element's count as the value.
"""
output_dict = dict()
for key in input_list:
add_dict_element_with_count(output_dict, key)
return output_dict
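# Sketch of the counting helpers above; the words are placeholders.
if __name__ == "__main__":
    _counts = dict_from_list_with_element_count(["fruit", "fly", "fruit"])
    print(_counts)                                      # {'fruit': 2, 'fly': 1}
    print(add_dict_element_with_count(_counts, "fly"))  # {'fruit': 2, 'fly': 2}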
def validate_column_details_between_two_record_list(first_list_of_dict: list, second_list_of_dict: list,
first_column_name: str = "cleaned_title", second_column_name: str =
'cleaned_title_pdf') -> tuple:
"""It produce list of matched columns rows and unmatched column rows based on same column from first list of dict.
Note- emphasis on first list as function check all records of first list of dict in second list of dict.
title column of second_list_of_dict is kept by merging with first.
Parameters
----------
second_column_name : str
This is the name of the column which contains the pdf article title.
first_list_of_dict : list
This is a list of dicts (records) which contains first_column_name.
second_list_of_dict : list
This is a list of dicts (records) which contains second_column_name.
first_column_name : str
This is the name of the column which contains the citation title.
Returns
-------
tuple
matched_list - It contains the column rows which matched in both data objects.
unmatched_list - It contains the column rows which did not match in both data objects.
"""
matched_list = []
unmatched_list = []
for article_name in first_list_of_dict:
validation_bool, percentage_matched, method = True, 0, None
for article_count in second_list_of_dict:
validation_bool, percentage_matched, method = multiple_methods_validating_words_string_in_text(
article_name[first_column_name], article_count[second_column_name])
# print(f"validation_bool: {validation_bool}, percentage_matched: {percentage_matched},
# text_manipulation_method_name: {text_manipulation_method_name}")
if validation_bool:
article_name_count = {**article_name, **article_count}
matched_list.append(article_name_count)
break
if not validation_bool:
unmatched_list.append([article_name[first_column_name], percentage_matched, method])
print(f"matched_list count = {len(matched_list)}, unmatched_list count = {len(unmatched_list)}")
return matched_list, unmatched_list
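# Hedged sketch: citation records are matched against pdf-derived records on their title columns.
# The records below are made up, and the actual outcome depends on the preprocessing done by
# systematic_review.string_manipulation:
#   citations = [{"cleaned_title": "fruit fly optimization algorithm", "year": 2020}]
#   pdf_records = [{"cleaned_title_pdf": "a modified fruit fly optimization algorithm"}]
#   matched, unmatched = validate_column_details_between_two_record_list(citations, pdf_records)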
def deep_validate_column_details_between_two_record_list(first_list_of_dict: list, second_list_of_dict: list,
first_column_name: str = "cleaned_title",
second_column_name: str = 'cleaned_title_pdf') -> tuple:
"""It produce list of matched columns rows and unmatched column rows based on same column from both.
Parameters
----------
second_column_name : str
This is the name of the column which contains the pdf article title.
first_list_of_dict : list
This is a list of dicts (records) which contains first_column_name.
second_list_of_dict : list
This is a list of dicts (records) which contains second_column_name.
first_column_name : str
This is the name of the column which contains the citation title.
Returns
-------
tuple
matched_list - It contains the column rows which matched in both data objects.
unmatched_list - It contains the column rows which did not match in both data objects.
"""
import copy
temp_first_list_of_dict = copy.deepcopy(first_list_of_dict)
temp_second_list_of_dict = copy.deepcopy(second_list_of_dict)
matched_list = []
for first_dict in list(temp_first_list_of_dict):  # iterate over a snapshot; the original list is mutated below
if first_column_name in first_dict:
for second_dict in temp_second_list_of_dict:
if second_column_name in second_dict:
if first_dict[first_column_name] == second_dict[second_column_name]:
article_name_count = {**first_dict, **second_dict}
matched_list.append(article_name_count)
temp_first_list_of_dict.remove(first_dict)
temp_second_list_of_dict.remove(second_dict)
break
unmatched_list = temp_first_list_of_dict + temp_second_list_of_dict
return matched_list, unmatched_list
def compare_two_dict_members_via_percent_similarity(first_dict: dict, second_dict: dict) -> float:
"""Compares the elements of two dictionaries and returns the percentage similarity.
Parameters
----------
first_dict : dict
Example - first_dict = {'mixed':1, 'modified':1, 'fruit':1, 'fly':1, 'optimization':1}
second_dict : dict
Example - second_dict = {'mixed':1, 'modified':1, 'fruit':1, 'fly':1, 'optimization':1, 'algorithm': 1}
Returns
-------
float
This is a percentage represented as a decimal number, e.g. 98.36065573770492, which means 98.36%.
"""
similar_dict_keys_count = 0
total_dict_keys_count = 0
all_dict_keys = {**first_dict, **second_dict}
for key, value in all_dict_keys.items():
if key in first_dict and key in second_dict:
if first_dict[key] == second_dict[key]:
same_values_in_dict = (2 * first_dict[key])
similar_dict_keys_count += same_values_in_dict
total_dict_keys_count += same_values_in_dict
else:
diff = abs(first_dict[key] - second_dict[key])
same_values_in_dict = first_dict[key] if first_dict[key] < second_dict[key] else second_dict[key]
similar_dict_keys_count += same_values_in_dict
total_dict_keys_count += (diff + same_values_in_dict)
else:
total_dict_keys_count += all_dict_keys[key]
percent_similarity = calculate_percentage(similar_dict_keys_count, total_dict_keys_count)
return percent_similarity
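# Worked sketch of the dictionary similarity above: 4 of the 5 total key counts are shared, giving 80.0.
# The word counts are placeholders.
if __name__ == "__main__":
    _first = {'fruit': 1, 'fly': 1}
    _second = {'fruit': 1, 'fly': 1, 'algorithm': 1}
    print(compare_two_dict_members_via_percent_similarity(_first, _second))  # 80.0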
def compare_two_list_members_via_percent_similarity(words_list: list, boolean_membership_list: list) -> float:
"""Compares the elements of two lists and returns the percentage similarity.
Parameters
----------
words_list : list
This contains the elements whose similarity is to be checked.
boolean_membership_list : list
This list contains True and False values marking which words were found.
Returns
-------
float
This is a percentage represented as a decimal number, e.g. 98.36065573770492, which means 98.36%.
"""
words_found_in_boolean_membership_list = 0
length_of_words_list = len(words_list)
for word_indicator in boolean_membership_list:
if word_indicator:
words_found_in_boolean_membership_list += 1
percent_similarity = calculate_percentage(words_found_in_boolean_membership_list, length_of_words_list)
return percent_similarity
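# Sketch: two of the four words are flagged True in the membership list, giving 50.0.
if __name__ == "__main__":
    print(compare_two_list_members_via_percent_similarity(
        ['mixed', 'fruit', 'fly', 'optimization'], [True, True, False, False]))  # 50.0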
def exact_words_checker_in_text(words_string: str, text_string: str) -> bool:
"""This checks for an exact, in-order match of the substring's words in the string and returns True or False.
Parameters
----------
words_string : str
These are the words we are searching for.
text_string : str
This is the query string or lengthy text.
Returns
-------
bool
This returns True if exact words_string found in text_string else False.
"""
if not isinstance(words_string, str) or not isinstance(text_string, str):
raise TypeError("words_string and text_string must be str")
words_list = string_manipulation.split_preprocess_string(words_string)
words_list_length = len(words_list)
words_list_end_element_index = words_list_length - 1
words_set = set(words_list)
# words_dict_membership = dict_from_list_with_element_count(words_list)
text_list = string_manipulation.split_preprocess_string(text_string)
validation_bool = False
searching_flag = False
for word_of_text in text_list:
if searching_flag:
if word_of_text == words_list[searching_index]:
if searching_index == words_list_end_element_index:
validation_bool = True
return validation_bool
searching_index += 1
else:
searching_flag = False
if word_of_text in words_set:
if word_of_text == words_list[0]:
# starting_index = words_list.index(word_of_text)
searching_flag = True
searching_index = 1
return validation_bool
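# Hedged sketch: both strings go through the same string_manipulation.split_preprocess_string call,
# so an in-order occurrence of the words is expected to validate. The strings are hypothetical.
if __name__ == "__main__":
    print(exact_words_checker_in_text(
        "fruit fly optimization",
        "a modified fruit fly optimization algorithm for big data"))  # expected True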
def words_percentage_checker_in_text(words_string: str, text_string: str, validation_limit: float = 70) -> tuple:
"""This checks for a match of the substring's words in the string and returns True or False based on success. It also
returns the matched word percentage.
Limitation: this doesn't work properly if words_string has duplicate words.
Parameters
----------
words_string : str
These are the words we are searching for.
text_string : str
This is the query string or lengthy text.
validation_limit : float
This is the percentage limit on the similarity of the checked substring. Example: 50 returns True if half of the words are found.
Returns
-------
tuple
This returns True if words_string is found in text_string, else False.
This also returns the matched substring percentage.
"""
words_list = string_manipulation.split_preprocess_string(words_string)
words_list_length = len(words_list)
# words_list_end_element_index = words_list_length - 1
words_set = set(words_list)
# words_dict_membership = dict_from_list_with_element_count(words_list)
text_list = string_manipulation.split_preprocess_string(text_string)
temp_list = [False] * words_list_length
validation_bool = False
# searching_flag = False
word_list_element_index = -1
percentage_matched = 0
for word_of_text in text_list:
if word_of_text in words_set:
word_of_text_index_in_words_list = words_list.index(word_of_text)
if word_of_text_index_in_words_list > word_list_element_index:
word_list_element_index = word_of_text_index_in_words_list
temp_list[word_of_text_index_in_words_list] = True
percentage_matched = compare_two_list_members_via_percent_similarity(words_list, temp_list)
validation_bool = True if percentage_matched > validation_limit else False
if validation_bool:
return validation_bool, percentage_matched
else:
temp_list = [False] * words_list_length
temp_list[word_of_text_index_in_words_list] = True
else:
temp_list = [False] * words_list_length
word_list_element_index = -1
return validation_bool, percentage_matched
def jumbled_words_percentage_checker_in_text(words_string: str, text_string: str, validation_limit: float = 70,
wrong_word_limit: int = 2) -> tuple:
"""start calculating percentage if half of words are found in sequence. This also takes in consideration of words
which got jumbled up due to pdf reading operation.
Parameters
----------
words_string : str
These are the words we are searching for.
text_string : str
This is the query string or lengthy text.
validation_limit : float
This is the percentage limit on the similarity of the checked substring. Example: 50 returns True if half of the words are found.
wrong_word_limit : int
This is the limit up to which the algorithm ignores wrong words in the sequence.
Returns
-------
tuple
This returns True if words_string is found in text_string, else False.
This also returns the matched substring percentage.
"""
words_list = string_manipulation.split_preprocess_string(words_string)
# words_list_length = len(words_list)
# words_list_end_element_index = words_list_length - 1
words_set = set(words_list)
words_dict_membership = dict_from_list_with_element_count(words_list)
text_list = string_manipulation.split_preprocess_string(text_string)
validation_bool = False
percentage_matched = 0
skipped_words = 0
temp_dict = dict()
for word_of_text in text_list:
if word_of_text in words_set:
skipped_words = 0
add_dict_element_with_count(temp_dict, word_of_text)
else:
skipped_words += 1
if skipped_words >= wrong_word_limit:
temp_dict = dict()
continue
percentage_matched = compare_two_dict_members_via_percent_similarity(words_dict_membership, temp_dict)
validation_bool = True if percentage_matched > validation_limit else False
if validation_bool:
return validation_bool, percentage_matched
temp_dict = dict()
return validation_bool, percentage_matched
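# Hedged sketch of the two percentage-based checkers above: with the default limit of 70, a title whose
# words mostly appear (even slightly out of order for the jumbled variant) should validate. The inputs
# are hypothetical and the exact percentages depend on the preprocessing.
if __name__ == "__main__":
    print(words_percentage_checker_in_text(
        "modified fruit fly optimization algorithm",
        "a modified fruit fly optimization algorithm for scheduling"))
    print(jumbled_words_percentage_checker_in_text(
        "modified fruit fly optimization algorithm",
        "a modified fruit optimization fly algorithm for scheduling"))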
def validating_pdf_via_filename(pdf_file_path: str, pages: str = "first", method: str = "exact_words") -> bool:
"""This function takes the name of the file and searches for that name in the text of the pdf file. If the search
succeeds, the pdf is validated as downloaded, else not downloaded. Example - pdf file name -> check in -> text of
pdf file.
Parameters
----------
pdf_file_path : str
This is the path of the pdf file.
pages : str
This could be 'all' to get the full text of the pdf, or 'first' for the first page of the pdf.
method : str
This is the switch option to select the method from exact_words, words_percentage, jumbled_words_percentage.
Returns
-------
bool
True or False value, where True depicts a validated article.
"""
text = converter.get_text_from_pdf(pdf_file_path, pages)
# print(text)
pdf_filename = os_utils.get_filename_from_path(pdf_file_path)
pdf_filename = string_manipulation.strip_string_from_right_side(pdf_filename)
if method == "exact_words":
validation_bool = exact_words_checker_in_text(pdf_filename, text)
elif method == "words_percentage":
validation_bool, percentage_matched = words_percentage_checker_in_text(pdf_filename, text)
elif method == "jumbled_words_percentage":
validation_bool, percentage_matched = jumbled_words_percentage_checker_in_text(pdf_filename, text)
else:
validation_bool = False
print("Please provide a valid method name; the given method name is not available")
return validation_bool
def multiple_methods_validating_pdf_via_filename(pdf_file_path: str, pages: str = "first",
pdf_reader: str = 'pdftotext') -> tuple:
"""This function checks name of file and find the name in the text of pdf file. if it become successful then pdf is
validated as downloaded else not downloaded. Example - pdf file name -> check in -> text of pdf file. pdf_reader
options are pdftotext or pymupdf.
Parameters
----------
pdf_reader : str
This is the python pdf reader package which converts pdf to text.
pdf_file_path : str
This is the path of the pdf file.
pages : str
This could be 'all' to get full text of pdf and 'first' for first page of pdf.
Returns
-------
tuple
True or False value, where True depicts a validated article.
This also shows the percentage matched.
Last, it shows the method used, like exact_words, words_percentage, jumbled_words_percentage, or 'all' if every
method was executed to validate.
"""
# percentage_matched = 0
text = converter.get_text_from_pdf(pdf_file_path, pages, pdf_reader)
# print(text)
pdf_filename = os_utils.get_filename_from_path(pdf_file_path)
return multiple_methods_validating_words_string_in_text(pdf_filename, text)
def validating_multiple_pdfs_via_filenames(list_of_pdf_files_path: list, pages: str = "first",
pdf_reader: str = 'pdftotext') -> tuple:
"""This function checks pdf files in list_of_pdf_files_path and validate them with function named
'validating_pdf_via_filename'. Example - multiple pdf file name -> check in -> text of pdf file.
pdf_reader options are pdftotext or pymupdf.
Parameters
----------
pages : str
This could be 'all' to get full text of pdf and 'first' for first page of pdf.
pdf_reader : str
This is the python pdf reader package which converts pdf to text.
list_of_pdf_files_path : list
This is the list of paths of the pdf files.
Returns
-------
tuple
validated_pdf_list - contains the pdf files whose filename is found in the pdf text.
invalidated_pdf_list - list of files which couldn't be included in validated_pdf_list.
manual_pdf_list - list of files which can't be opened using the python pdf readers or which raise errors on opening.
"""
validated_pdf_list = []
invalidated_pdf_list = []
manual_pdf_list = []
try:
import pdftotext
except ImportError:
print("""This function requires pdftotext library to read pdfs.
step 1. install OS Dependencies:
These instructions assume you're using Python 3 on a recent OS.
- Debian, Ubuntu, and friends
sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
- Fedora, Red Hat, and friends
sudo yum install gcc-c++ pkgconfig poppler-cpp-devel python3-devel
- macOS
brew install pkg-config poppler python
- Windows (Install poppler through conda)
conda install -c conda-forge poppler
step 2. Install pdftotext
pip install pdftotext
for more info, please visit https://pypi.org/project/pdftotext/""")
for article_name_path in list_of_pdf_files_path:
try:
value, percentage_matched, methods = multiple_methods_validating_pdf_via_filename(article_name_path,
pages, pdf_reader)
if value:
# print("validated")
validated_pdf_list.append([article_name_path, percentage_matched, methods])
elif not value:
# print("invalidated")
invalidated_pdf_list.append([article_name_path, percentage_matched, methods])
except Exception:
manual_pdf_list.append([article_name_path, 0, None])
return validated_pdf_list, invalidated_pdf_list, manual_pdf_list
class ValidateWordsInText:
"""This checks for the given words in the given text.
"""
def __init__(self, words_string: str, text_string: str,
words_percentage_checker_in_text_validation_limit: float = 70,
jumbled_words_percentage_checker_in_text_validation_limit: float = 70,
jumbled_words_percentage_checker_in_text_wrong_word_limit: int = 2):
"""
Parameters
----------
words_string : str
These are the words we are searching for.
text_string : str
This is the query string or lengthy text.
words_percentage_checker_in_text_validation_limit : float
This is the percentage limit on the similarity of the checked substring. Example: 50 returns True if half of
the words are found.
jumbled_words_percentage_checker_in_text_validation_limit : float
This is the percentage limit on the similarity of the checked substring for the jumbled words check.
jumbled_words_percentage_checker_in_text_wrong_word_limit : int
This is the limit up to which the algorithm ignores wrong words in the sequence.
"""
self.words_percentage_checker_in_text_validation_limit = words_percentage_checker_in_text_validation_limit
self.jumbled_words_percentage_checker_in_text_validation_limit = \
jumbled_words_percentage_checker_in_text_validation_limit
self.jumbled_words_percentage_checker_in_text_wrong_word_limit = \
jumbled_words_percentage_checker_in_text_wrong_word_limit
self.text_string = text_string
self.words_string = words_string
def exact_words_checker_in_text(self) -> bool:
"""This checks for an exact, in-order match of the substring's words in the string and returns True or False.
Returns
-------
bool
This returns True if exact words_string found in text_string else False.
"""
if not isinstance(self.words_string, str) or not isinstance(self.text_string, str):
raise TypeError("words_string and text_string must be str")
words_list = string_manipulation.split_preprocess_string(self.words_string)
words_list_length = len(words_list)
words_list_end_element_index = words_list_length - 1
words_set = set(words_list)
# words_dict_membership = dict_from_list_with_element_count(words_list)
text_list = string_manipulation.split_preprocess_string(self.text_string)
validation_bool = False
searching_flag = False
for word_of_text in text_list:
if searching_flag:
if word_of_text == words_list[searching_index]:
if searching_index == words_list_end_element_index:
validation_bool = True
return validation_bool
searching_index += 1
else:
searching_flag = False
if word_of_text in words_set:
if word_of_text == words_list[0]:
# starting_index = words_list.index(word_of_text)
searching_flag = True
searching_index = 1
return validation_bool
def words_percentage_checker_in_text(self) -> tuple:
"""This checks for a match of the substring's words in the string and returns True or False based on success. It
also returns the matched word percentage.
Limitation: this doesn't work properly if words_string has duplicate words.
Returns
-------
tuple
This returns True if words_string is found in text_string, else False.
This also returns the matched substring percentage.
"""
words_list = string_manipulation.split_preprocess_string(self.words_string)
words_list_length = len(words_list)
# words_list_end_element_index = words_list_length - 1
words_set = set(words_list)
# words_dict_membership = dict_from_list_with_element_count(words_list)
text_list = string_manipulation.split_preprocess_string(self.text_string)
temp_list = [False] * words_list_length
validation_bool = False
# searching_flag = False
word_list_element_index = -1
percentage_matched = 0
for word_of_text in text_list:
if word_of_text in words_set:
word_of_text_index_in_words_list = words_list.index(word_of_text)
if word_of_text_index_in_words_list > word_list_element_index:
word_list_element_index = word_of_text_index_in_words_list
temp_list[word_of_text_index_in_words_list] = True
percentage_matched = compare_two_list_members_via_percent_similarity(words_list, temp_list)
validation_bool = True if \
percentage_matched > self.words_percentage_checker_in_text_validation_limit else False
if validation_bool:
return validation_bool, percentage_matched
else:
temp_list = [False] * words_list_length
temp_list[word_of_text_index_in_words_list] = True
else:
temp_list = [False] * words_list_length
word_list_element_index = -1
return validation_bool, percentage_matched
def jumbled_words_percentage_checker_in_text(self) -> tuple:
"""Start calculating the percentage if half of the words are found in sequence. This also takes into consideration
words which got jumbled up by the pdf reading operation.
Returns
-------
tuple
This returns True if words_string is found in text_string, else False.
This also returns the matched substring percentage.
"""
words_list = string_manipulation.split_preprocess_string(self.words_string)
# words_list_length = len(words_list)
# words_list_end_element_index = words_list_length - 1
words_set = set(words_list)
words_dict_membership = dict_from_list_with_element_count(words_list)
text_list = string_manipulation.split_preprocess_string(self.text_string)
validation_bool = False
percentage_matched = 0
skipped_words = 0
temp_dict = dict()
for word_of_text in text_list:
if word_of_text in words_set:
skipped_words = 0
add_dict_element_with_count(temp_dict, word_of_text)
else:
skipped_words += 1
if skipped_words >= self.jumbled_words_percentage_checker_in_text_wrong_word_limit:
temp_dict = dict()
continue
percentage_matched = compare_two_dict_members_via_percent_similarity(words_dict_membership, temp_dict)
validation_bool = True if \
percentage_matched > self.jumbled_words_percentage_checker_in_text_validation_limit else False
if validation_bool:
return validation_bool, percentage_matched
temp_dict = dict()
return validation_bool, percentage_matched
def multiple_methods(self) -> tuple:
"""This method uses different checks to validate the article_name (substring) in the text.
Example - exact_words, words_percentage, jumbled_words_percentage.
Returns
-------
tuple
True or False value, where True depicts a validated article.
This also shows the percentage matched.
Last, it shows the method used, like exact_words, words_percentage, jumbled_words_percentage, or 'all' if every
method was executed to validate.
"""
# percentage_matched = 0
validation_bool = exact_words_checker_in_text(self.words_string, self.text_string)
if validation_bool:
return validation_bool, 100, "exact_words"
validation_bool, percentage_matched = words_percentage_checker_in_text(
self.words_string, self.text_string, self.words_percentage_checker_in_text_validation_limit)
if validation_bool:
return validation_bool, percentage_matched, "words_percentage"
validation_bool, percentage_matched = jumbled_words_percentage_checker_in_text(
self.words_string, self.text_string, self.jumbled_words_percentage_checker_in_text_validation_limit,
self.jumbled_words_percentage_checker_in_text_wrong_word_limit)
if validation_bool:
return validation_bool, percentage_matched, "jumbled_words_percentage"
return validation_bool, percentage_matched, "all"
def multiple_methods_validating_words_string_in_text(
article_name: str, text: str,
words_percentage_checker_in_text_validation_limit: float = 70,
jumbled_words_percentage_checker_in_text_validation_limit: float = 70,
jumbled_words_percentage_checker_in_text_wrong_word_limit: int = 2) -> tuple:
"""This text_manipulation_method_name uses different methods to validate the article_name(substring) in text. Example - exact_words,
words_percentage, jumbled_words_percentage.
Parameters
----------
article_name : str
This is the input string which we want to validate in the text.
text : str
This is the query string or lengthy text.
words_percentage_checker_in_text_validation_limit : float
This is the percentage limit on the similarity of the checked substring. Example: 50 returns True if half of the words are found.
jumbled_words_percentage_checker_in_text_validation_limit : float
This is the percentage limit on the similarity of the checked substring for the jumbled words check.
jumbled_words_percentage_checker_in_text_wrong_word_limit : int
This is the limit up to which the algorithm ignores wrong words in the sequence.
Returns
-------
tuple
True or False value, where True depicts a validated article.
This also shows the percentage matched.
Last, it shows the method used, like exact_words, words_percentage, jumbled_words_percentage, or 'all' if every
method was executed to validate.
"""
# percentage_matched = 0
validation_bool = exact_words_checker_in_text(article_name, text)
if validation_bool:
return validation_bool, 100, "exact_words"
validation_bool, percentage_matched = words_percentage_checker_in_text(
article_name, text, words_percentage_checker_in_text_validation_limit)
if validation_bool:
return validation_bool, percentage_matched, "words_percentage"
validation_bool, percentage_matched = jumbled_words_percentage_checker_in_text(
article_name, text, jumbled_words_percentage_checker_in_text_validation_limit,
jumbled_words_percentage_checker_in_text_wrong_word_limit)
if validation_bool:
return validation_bool, percentage_matched, "jumbled_words_percentage"
return validation_bool, percentage_matched, "all"
def finding_missed_articles_from_downloading(validated_pdf_list: list, original_articles_list: list) -> tuple:
"""Checks how many articles from the original list of articles have not been downloaded yet.
Parameters
----------
validated_pdf_list : list
Contains name of pdf files whose filename is in the pdf text.
original_articles_list : list
This is original list from where we started downloading the articles.
Returns
-------
tuple
missing_articles - these are the articles which were missed during downloading.
downloaded_articles - this is the list of validated downloaded articles.
"""
validated_pdf_text = converter.list_to_string(validated_pdf_list)
original_articles_set = set(original_articles_list)
missing_articles = []
downloaded_articles = []
for article_name in original_articles_set:
validation_bool, percentage_matched, methods = multiple_methods_validating_words_string_in_text(
article_name, validated_pdf_text)
if not validation_bool:
missing_articles.append(article_name)
elif validation_bool:
downloaded_articles.append(article_name)
return missing_articles, downloaded_articles
def get_missed_original_articles_list(original_article_list: list, downloaded_article_list: list) -> list:
"""This checks the elements of original_article_list in downloaded_article_list and returns the missed articles list.
Parameters
----------
original_article_list : list
This list's elements are checked for presence in the other list.
downloaded_article_list : list
This list is checked to see whether it contains the elements of the other list.
Returns
-------
list
This contains missing elements of original_article_list in downloaded_article_list.
"""
missed_articles_list = []
downloaded_articles_set = set(downloaded_article_list)
for article_name in original_article_list:
if article_name not in downloaded_articles_set:
missed_articles_list.append(article_name)
return missed_articles_list
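# Sketch with placeholder titles: everything in the original list that is absent from the downloaded
# list is reported as missed.
if __name__ == "__main__":
    print(get_missed_original_articles_list(
        ["article a", "article b", "article c"], ["article b"]))  # ['article a', 'article c']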
def get_missed_articles_dataframe(filter_sorted_citations_df: pd.DataFrame, downloaded_articles_path: str,
title_column_name: str = "cleaned_title") -> list:
"""return list of missed articles from downloading by checking original list of articles from
filter_sorted_citations_df using downloaded articles path.
Parameters
----------
title_column_name : str
This is the name of the column which contains the name of the article.
filter_sorted_citations_df : pd.DataFrame
This dataframe contains the records of selected articles, including the names of the articles.
downloaded_articles_path : str
This is the parent folder of all the downloaded article files.
Returns
-------
list
list of the missed articles from downloading.
"""
original_list = [i for i in filter_sorted_citations_df[title_column_name]]
validated_articles_list, invalidated_list, manual_list = validating_pdfs_using_multiple_pdf_reader(
downloaded_articles_path)
articles_list = getting_article_paths_from_validation_detail(validated_articles_list)
downloaded_list = [
string_manipulation.preprocess_string(os_utils.get_filename_from_path(k))
for k in articles_list]
missed_articles = finding_missed_articles_from_downloading(downloaded_list, original_list)
return missed_articles[0]
def getting_article_paths_from_validation_detail(list_of_validation: list) -> list:
"""Gets the first element from each inner list in a list of lists.
Parameters
----------
list_of_validation : list
This list contains lists of three values, where the first value is the article path.
Returns
-------
list
This output list contains the article paths.
"""
article_list = [i[0] for i in list_of_validation]
return article_list
def validating_pdfs_using_multiple_pdf_reader(pdfs_parent_dir_path: str) -> tuple:
"""This function uses two python pdf readers, pdftotext and pymupdf, to validate whether the filenames are present
inside the pdf file text.
Parameters
----------
pdfs_parent_dir_path : str
This is the parent directory of all the downloaded pdfs.
Returns
-------
tuple
validated_pdf_list - contains the pdf files whose filename is found in the pdf text.
invalidated_pdf_list - list of files which couldn't be included in validated_pdf_list.
manual_pdf_list - list of files which can't be opened using the python pdf readers or which raise errors on opening.
"""
articles_paths = os_utils.extract_files_path_from_directories_or_subdirectories(
pdfs_parent_dir_path)
print(f"Total number of articles: {len(articles_paths)}")
validated_list, invalidated_list, manual_list = validating_multiple_pdfs_via_filenames(articles_paths)
print(f"Using pdftotext reader to validate:")
print(f"Number of validated articles : {len(validated_list)}\n"
f"Number of invalidated articles : {len(invalidated_list)}\n "
f"Number of articles to open manually: {len(manual_list)}")
print("validating invalidated articles using other pdf reader.")
temp_invalidated_list, temp_manual_list = [], []
if len(invalidated_list) != 0:
temp_invalidated_list = getting_article_paths_from_validation_detail(invalidated_list)
if len(manual_list) != 0:
temp_manual_list = getting_article_paths_from_validation_detail(manual_list)
invalidated_list = temp_invalidated_list + temp_manual_list
temp_validated_list, temp_invalidated_list, temp_manual_list = validating_multiple_pdfs_via_filenames(
invalidated_list, pdf_reader="pymupdf")
print(f"Using pymupdf reader to validate:")
print(f"Number of validated articles : {len(temp_validated_list)}\n"
f"Number of invalidated articles : {len(temp_invalidated_list)}\n "
f"Number of articles to open manually: {len(temp_manual_list)}")
validated_list += temp_validated_list
invalidated_list = temp_invalidated_list
manual_list = temp_manual_list
print("Finally, using both python pdf readers:")
print(f"Number of validated articles : {len(validated_list)}\n"
f"Number of invalidated articles : {len(invalidated_list)}\n "
f"Number of articles to open manually: {len(manual_list)}")
return validated_list, invalidated_list, manual_list
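# Hedged usage sketch (not executed here because it needs a real directory of downloaded pdfs; the path
# is hypothetical). Each element of the three returned lists is [pdf_path, percentage_matched, method]:
#   validated, invalidated, manual = validating_pdfs_using_multiple_pdf_reader("downloaded_articles/")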
def manual_validating_of_pdf(articles_path_list: list, manual_index: int) -> tuple:
"""This is mostly a manually used function to validate some pdfs at the end of the validation process. It makes it
easy to search and validate a pdf and store it in a list.
Advice: convert these lists to a text file using the function in the converter module to avoid data loss.
Parameters
----------
articles_path_list : list
This is the list of articles which skipped our automated screening and validation algorithms, mostly due to
pdf to text conversion errors.
manual_index : int
This is the index from which you will start checking in articles_path_list. Normally used over many tries.
Returns
-------
tuple
external_validation_list - This is the list to be saved externally for validated articles.
external_invalidated_list - This is the list to be saved externally for invalidated articles.
"""
external_validation_list = []
external_invalidated_list = []
article_path = articles_path_list[manual_index]
print(article_path)
instructions = "Please provide 'y' to validate or 'n' to invalidate"
manual_input = input(instructions).lower()
if manual_input == "y":
external_validation_list.append(article_path)
elif manual_input == "n":
external_invalidated_list.append(article_path)
else:
print("input should be 'y' or 'n'")
return external_validation_list, external_invalidated_list
class Validation:
"""This is used to validate the downloaded files.
"""
download_flag_column_name = 'downloaded'
research_paper_file_location_column_name = 'file location'
validation_method_column_name = "validation method"
validation_manual_method_name = "manual"
cleaned_article_column_name = 'cleaned_title'
file_manual_check_flag_name = "unreadable"
file_validated_flag_name = "yes"
file_invalidated_flag_name = "wrong"
file_not_downloaded_flag_name = "no"
file_not_accessible_flag_name = "no access"
def __init__(self, citations_data: Union[List[dict], pd.DataFrame],
parents_directory_of_research_papers_files: str,
text_file_path_of_inaccessible_research_papers: str = None,
text_manipulation_method_name: str = "preprocess_string_to_space_separated_words",
words_percentage_checker_in_text_validation_limit: float = 70,
jumbled_words_percentage_checker_in_text_validation_limit: float = 70,
jumbled_words_percentage_checker_in_text_wrong_word_limit: int = 2
):
"""
Parameters
----------
citations_data : Union[List[dict], pd.DataFrame]
This contains citation data which we are validating.
parents_directory_of_research_papers_files : str
This is parent directory of all the downloaded files for citation data.
text_file_path_of_inaccessible_research_papers : str
This is the path of text file containing non-accessible research papers separated by newline.
text_manipulation_method_name : str
This provides the option to use any text manipulation function:
preprocess_string (default, and applied before all other implemented functions),
custom_text_manipulation_function - for plugging in your own function to preprocess the text,
nltk_remove_stopwords, pattern_lemma_or_lemmatize_text, nltk_word_net_lemmatizer,
nltk_porter_stemmer, nltk_lancaster_stemmer, spacy_lemma, nltk_remove_stopwords_spacy_lemma,
convert_string_to_lowercase, preprocess_string_to_space_separated_words
words_percentage_checker_in_text_validation_limit : float
This is the percentage limit on the similarity of the checked substring. Example: 50 returns True if half of the words are found.
jumbled_words_percentage_checker_in_text_validation_limit : float
This is the percentage limit on the similarity of the checked substring for the jumbled words check.
jumbled_words_percentage_checker_in_text_wrong_word_limit : int
This is the limit up to which the algorithm ignores wrong words in the sequence.
"""
self.jumbled_words_percentage_checker_in_text_wrong_word_limit = \
jumbled_words_percentage_checker_in_text_wrong_word_limit
self.jumbled_words_percentage_checker_in_text_validation_limit = \
jumbled_words_percentage_checker_in_text_validation_limit
self.words_percentage_checker_in_text_validation_limit = words_percentage_checker_in_text_validation_limit
self.text_manipulation_method_name = text_manipulation_method_name
self.text_file_path_of_inaccessible_research_papers = text_file_path_of_inaccessible_research_papers
self.parents_directory_of_research_papers_files = parents_directory_of_research_papers_files
self.citations_records_list = converter.dataframe_to_records_list(citations_data) \
if type(citations_data) == pd.DataFrame else citations_data
self.research_papers_list = self.add_downloaded_flag_column_and_file_location_column()
self.file_name_and_path_mapping = self.file_name_and_path_dict()
def add_downloaded_flag_column_and_file_location_column(self):
"""Adds empty columns based on research_paper_file_location_column_name and download_flag_column_name.
Returns
-------
List[dict]
data contains new columns.
"""
import copy
complete_citations_records_list = copy.deepcopy(self.citations_records_list)
inaccessible_research_papers_set = set([string_manipulation.text_manipulation_methods(
article_name, self.text_manipulation_method_name) for article_name in converter.text_file_to_list(
self.text_file_path_of_inaccessible_research_papers)]) if \
self.text_file_path_of_inaccessible_research_papers else self.text_file_path_of_inaccessible_research_papers
for record in complete_citations_records_list:
if inaccessible_research_papers_set and \
(record[self.cleaned_article_column_name] in inaccessible_research_papers_set):
record[self.download_flag_column_name] = self.file_not_accessible_flag_name
else:
record[self.download_flag_column_name] = self.file_not_downloaded_flag_name
record[self.research_paper_file_location_column_name] = ""
record[self.validation_method_column_name] = ""
return complete_citations_records_list
def file_name_and_path_dict(self):
"""Builds the mapping of filenames to file paths.
Returns
-------
dict
The key is the filename and the value is the file path.
"""
file_name_and_path = {}
articles_paths = os_utils.extract_files_path_from_directories_or_subdirectories(
self.parents_directory_of_research_papers_files)
for path in articles_paths:
article_name = os_utils.get_filename_from_path(path)
clean_article_name = string_manipulation.text_manipulation_methods(
article_name, self.text_manipulation_method_name)
file_name_and_path[clean_article_name] = path
return file_name_and_path
def check(self):
"""Executes the validation of research articles in the citation data by checking the research paper files and
validating whether the research articles are correct.
Returns
-------
List[dict]
The data contains the new downloaded, validation method and file location columns.
"""
for citation in self.research_papers_list:
if (citation[self.download_flag_column_name].lower() == "no") and (
citation[self.cleaned_article_column_name] in self.file_name_and_path_mapping):
research_paper = converter.Reader(
self.file_name_and_path_mapping[citation[self.cleaned_article_column_name]])
file_extension = research_paper.file_extension
if file_extension == 'pdf':
text = research_paper.pdf_pdftotext_reader()
if text:
validation_result = ValidateWordsInText(
citation[self.cleaned_article_column_name], text,
self.words_percentage_checker_in_text_validation_limit,
self.jumbled_words_percentage_checker_in_text_validation_limit,
self.jumbled_words_percentage_checker_in_text_wrong_word_limit).multiple_methods()
if validation_result[0]:
citation[self.download_flag_column_name] = self.file_validated_flag_name
citation[self.research_paper_file_location_column_name] = self.file_name_and_path_mapping[
citation[self.cleaned_article_column_name]]
citation[self.validation_method_column_name] = validation_result[2]
continue
text = research_paper.pdf_pymupdf_reader()
if not text:
citation[self.download_flag_column_name] = self.file_manual_check_flag_name
continue
validation_result = ValidateWordsInText(
citation[self.cleaned_article_column_name], text,
self.words_percentage_checker_in_text_validation_limit,
self.jumbled_words_percentage_checker_in_text_validation_limit,
self.jumbled_words_percentage_checker_in_text_wrong_word_limit).multiple_methods()
if validation_result[0]:
citation[self.download_flag_column_name] = self.file_validated_flag_name
citation[self.research_paper_file_location_column_name] = self.file_name_and_path_mapping[
citation[self.cleaned_article_column_name]]
citation[self.validation_method_column_name] = validation_result[2]
else:
citation[self.download_flag_column_name] = self.file_invalidated_flag_name
citation[self.research_paper_file_location_column_name] = self.file_name_and_path_mapping[
citation[self.cleaned_article_column_name]]
citation[self.validation_method_column_name] = validation_result[2]
else:
text = research_paper.get_text()
if not text:
citation[self.download_flag_column_name] = self.file_manual_check_flag_name
continue
validation_result = ValidateWordsInText(
citation[self.cleaned_article_column_name], text,
self.words_percentage_checker_in_text_validation_limit,
self.jumbled_words_percentage_checker_in_text_validation_limit,
self.jumbled_words_percentage_checker_in_text_wrong_word_limit).multiple_methods()
if validation_result[0]:
citation[self.download_flag_column_name] = self.file_validated_flag_name
citation[self.research_paper_file_location_column_name] = self.file_name_and_path_mapping[
citation[self.cleaned_article_column_name]]
citation[self.validation_method_column_name] = validation_result[2]
else:
citation[self.download_flag_column_name] = self.file_invalidated_flag_name
citation[self.research_paper_file_location_column_name] = self.file_name_and_path_mapping[
citation[self.cleaned_article_column_name]]
citation[self.validation_method_column_name] = validation_result[2]
return self.research_papers_list
def get_records_list(self) -> List[Dict[str, Any]]:
"""Outputs the records list containing validation results of input data.
Returns
-------
List[Dict[str, Any]]
This is the list of records which contains the validation flag column 'downloaded' with the values "yes", "no",
"wrong", "no access", "unreadable", and the file location column when the downloaded column contains "yes".
"""
return self.check()
def get_dataframe(self):
"""Outputs the pandas.DataFrame containing validation results of input data.
Returns
-------
pandas.DataFrame
This is the dataframe which contains the validation flag column 'downloaded' with the values "yes", "no",
"wrong", "no access", "unreadable", and the file location column when the downloaded column contains "yes".
"""
return converter.records_list_to_dataframe(self.check())
def info(self):
"""Equivalent to pandas.DataFrame.value_counts(); it returns the counts of the unique elements in the column.
Returns
-------
object
unique download_flag_column_name elements with counts
"""
return converter.dataframe_column_counts(self.get_dataframe(), self.download_flag_column_name)
def to_csv(self, output_filename: Union[str, None] = "output.csv", index: bool = True):
"""This function saves pandas.DataFrame to csv file.
Parameters
----------
output_filename : str
This is the name of the output file, which should contain the .csv extension.
index : bool
Define if index is needed in output csv file or not.
Returns
-------
"""
converter.dataframe_to_csv_file(self.get_dataframe(), output_filename, index)
def to_excel(self, output_filename: Union[str, None] = "output.xlsx", index: bool = True):
"""This function saves pandas.DataFrame to an excel file.
Parameters
----------
output_filename : str
This is the name of the output file, which should contain the .xlsx extension.
index : bool
Define if index is needed in output excel file or not.
Returns
-------
"""
converter.dataframe_to_excel_file(self.get_dataframe(), output_filename, index)
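# Hedged usage sketch of the Validation class; the paths and file names are hypothetical, and it needs
# the citation data plus the folder of downloaded papers on disk:
#   validation = Validation(filter_sorted_citations_df, "downloaded_articles/",
#                           text_file_path_of_inaccessible_research_papers="no_access.txt")
#   validation.info()                    # counts of the 'downloaded' flag values
#   validation.to_csv("validation.csv")  # full records with flags and file locations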