nlp-articleprocessor
Science Score: 31.0%
This score indicates how likely this project is to be science-related based on various indicators:
-
✓CITATION.cff file
Found CITATION.cff file -
✓codemeta.json file
Found codemeta.json file -
○.zenodo.json file
-
○DOI references
-
○Academic links in README
-
○Academic email domains
-
○Institutional organization owner
-
○JOSS paper metadata
-
○Scientific vocabulary similarity
Unable to calculate vocabulary similarity
Last synced: 10 months ago
·
JSON representation
·
Repository
Basic Info
- Host: GitHub
- Owner: RenatoVA
- Language: Jupyter Notebook
- Default Branch: master
- Size: 26.4 MB
Statistics
- Stars: 0
- Watchers: 1
- Forks: 0
- Open Issues: 0
- Releases: 0
Created almost 2 years ago
· Last pushed 11 months ago
Metadata Files
Citation
Owner
- Login: RenatoVA
- Kind: user
- Repositories: 1
- Profile: https://github.com/RenatoVA
Citation (citations_model.py)
import requests
from bs4 import BeautifulSoup
import spacy
from fuzzywuzzy import fuzz
import re
# Load the spaCy "en_core_web_sm" pipeline once at import time;
# extract_entities() reuses this object on every call.
nlp = spacy.load("en_core_web_sm")
def extract_text_from_url(url, timeout=10):
    """Download *url* and return the text content of its HTML.

    The HTTP status code is not checked, so the text of an error page is
    returned like any other page.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The string produced by ``BeautifulSoup.get_text()`` for the
        response body parsed with ``html.parser``.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` (connection failure, timeout, ...).
    """
    # Without an explicit timeout, requests waits indefinitely on a host
    # that accepts the connection but never answers.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text()
def extract_entities(text):
    """Run the module-level spaCy pipeline on *text* and list its entities.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, in the order the
        pipeline reports them in ``doc.ents``.
    """
    pairs = []
    for span in nlp(text).ents:
        pairs.append((span.text, span.label_))
    return pairs
def compare_citation(extracted_entities, expected_citation):
    """Score how well extracted entities match an expected citation.

    Args:
        extracted_entities: Sequence of items whose index 0 is the entity
            text and index 1 is the entity label.
        expected_citation: Mapping with ``'authors'`` and ``'title'`` keys.

    Returns:
        A ``(score_authors, score_title)`` tuple of
        ``fuzz.token_sort_ratio`` results: PERSON entities joined by spaces
        are compared to the expected authors, WORK_OF_ART entities joined
        by spaces are compared to the expected title.
    """

    def _joined_texts(label):
        # One pass over the entities per label, keeping their order.
        return " ".join(
            item[0] for item in [e for e in extracted_entities if e[1] == label]
        )

    author_text = _joined_texts('PERSON')
    title_text = _joined_texts('WORK_OF_ART')

    author_score = fuzz.token_sort_ratio(author_text, expected_citation['authors'])
    title_score = fuzz.token_sort_ratio(title_text, expected_citation['title'])
    return author_score, title_score
def extract_authors_and_titles(citation, title):
    """Build the expected-citation mapping from a citation string and a title.

    The authors are taken to be everything in *citation* that precedes its
    first digit (presumably the start of the publication year), with
    surrounding whitespace removed. If the citation contains no digit, the
    whole stripped citation is used as the authors instead of failing.

    Args:
        citation: Full citation text.
        title: Title to store unchanged in the result.

    Returns:
        A dict with keys ``'authors'`` and ``'title'``.
    """
    # Search for the first digit directly rather than anchoring a lazy
    # ".*?" match: this also works when a newline precedes the digit and
    # yields None (not an AttributeError) when there is no digit at all.
    first_digit = re.search(r"\d", citation)
    if first_digit is None:
        authors = citation.strip()
    else:
        authors = citation[:first_digit.start()].strip()
    return {
        'authors': authors,
        'title': title,
    }
def verify_citations(url, citation, title):
    """Check whether the page at *url* appears to match a citation.

    The expected authors/title are derived from *citation* and *title*,
    the page text is fetched and run through entity extraction, and the
    resulting author and title scores are compared against a threshold.

    Returns:
        ``'Yes'`` when both scores are strictly greater than 60,
        otherwise ``'No'``.
    """
    expected = extract_authors_and_titles(citation, title)
    page_text = extract_text_from_url(url)
    page_entities = extract_entities(page_text)
    author_score, title_score = compare_citation(page_entities, expected)

    # Either score at or below the threshold rejects the citation.
    if author_score <= 60 or title_score <= 60:
        return 'No'
    return 'Yes'