brevetnlpv2
Science Score: 31.0%
This score indicates how likely this project is to be science-related based on various indicators:
-
✓CITATION.cff file
Found CITATION.cff file -
✓codemeta.json file
Found codemeta.json file -
○.zenodo.json file
-
○DOI references
-
○Academic publication links
-
○Academic email domains
-
○Institutional organization owner
-
○JOSS paper metadata
-
○Scientific vocabulary similarity
Unable to calculate vocabulary similarity
Last synced: 10 months ago
·
JSON representation
·
Repository
Basic Info
- Host: GitHub
- Owner: edgarLan
- Language: Jupyter Notebook
- Default Branch: main
- Size: 34.9 MB
Statistics
- Stars: 0
- Watchers: 1
- Forks: 0
- Open Issues: 0
- Releases: 0
Created over 1 year ago
· Last pushed about 1 year ago
Metadata Files
Readme
Citation
README.md
brevetNLPv2
Owner
- Login: edgarLan
- Kind: user
- Repositories: 1
- Profile: https://github.com/edgarLan
Citation (citations/a3k_explo.ipynb)
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# pip install --use-pep517 alexandria3k"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from alexandria3k.data_sources import uspto"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"uspto_directory = \"C:/Users/edgar/OneDrive/Bureau/Ecole/HEC/A24/BrevetNLP/citations_data/patentsView\""
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<alexandria3k.data_sources.uspto.Uspto at 0x265e792be50>"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# uspto.Uspto(uspto_directory=uspto_directory)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# uspto.Uspto(uspto_directory=uspto_directory).download(data_location=uspto_directory)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"uspto_data = uspto.Uspto(uspto_directory)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"database_path = \"uspto_data.db\"\n",
"uspto_data.populate(database_path, columns=[\"usp_citations.*\"], )"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" patent_id patcit_doc_number patcit_country patcit_kind patcit_date \\\n",
"0 0 D11495 US S 18791100 \n",
"1 0 D50715 US S 19170500 \n",
"2 0 D74119 US S 19271200 \n",
"3 0 D119611 US S 19400300 \n",
"4 0 3140954 US A 19640700 \n",
"\n",
" category \n",
"0 cited by other \n",
"1 cited by other \n",
"2 cited by examiner \n",
"3 cited by examiner \n",
"4 cited by other \n"
]
}
],
"source": [
"import sqlite3\n",
"import pandas as pd\n",
"\n",
"conn = sqlite3.connect(database_path)\n",
"cursor = conn.cursor()\n",
"\n",
"query = \"\"\"\n",
"SELECT\n",
" patent_id, patcit_doc_number, patcit_country, patcit_kind, patcit_date, category\n",
"FROM usp_citations\n",
"\"\"\"\n",
"\n",
"# query = \"\"\"\n",
"# SELECT\n",
"# patent_id, patcit_doc_number, patcit_country, patcit_kind, patcit_date, category,\n",
"# date_published\n",
"# FROM usp_citations\n",
"# JOIN us_patents ON usp_citations.patent_id = us_patents.id;\n",
"# \"\"\"\n",
"\n",
"# Execute the query\n",
"cursor.execute(query)\n",
"\n",
"# Fetch the results\n",
"results = cursor.fetchall()\n",
"\n",
"# Get the column names (this can be useful for the DataFrame)\n",
"columns = [description[0] for description in cursor.description]\n",
"\n",
"# Create a DataFrame from the results\n",
"df = pd.DataFrame(results, columns=columns)\n",
"\n",
"# Close the database connection\n",
"conn.close()\n",
"\n",
"# Now you can work with the DataFrame (df)\n",
"print(df.head()) # Print the first few rows of the DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 18791100\n",
"1 19170500\n",
"2 19271200\n",
"3 19400300\n",
"4 19640700\n",
" ... \n",
"196060 20070300\n",
"196061 20070400\n",
"196062 19881100\n",
"196063 20020500\n",
"196064 None\n",
"Name: patcit_date, Length: 196065, dtype: object"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\"patcit_date\"]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"usp_citations\n",
"us_patents\n"
]
}
],
"source": [
"conn = sqlite3.connect(database_path) # Replace with your database path\n",
"\n",
"# Create a cursor object\n",
"cursor = conn.cursor()\n",
"\n",
"# Query to retrieve all table names\n",
"query = \"\"\"\n",
"SELECT name FROM sqlite_master WHERE type='table';\n",
"\"\"\"\n",
"\n",
"# Execute the query\n",
"cursor.execute(query)\n",
"\n",
"# Fetch all the results (table names)\n",
"tables = cursor.fetchall()\n",
"\n",
"# Print the table names\n",
"for table in tables:\n",
" print(table[0]) # Each row is a tuple, so we access the first element (table name)\n",
" \n",
"# Close the connection\n",
"conn.close()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"database_path = \"uspto_data.db\"\n",
"\n",
"# Define the condition for the JOIN operation\n",
"condition = \"\"\"\n",
"usp_citations.patent_id = us_patents.id\n",
"\"\"\"\n",
"\n",
"# Define the columns you want to populate\n",
"columns = [\"usp_citations.*\", \"us_patents.*\"]\n",
"\n",
"# Call the populate function\n",
"uspto_data.populate(database_path, columns=columns, condition=condition)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" patent_id patcit_doc_number patcit_country patcit_kind patcit_date \\\n",
"0 0 20120103 US S 18791100 \n",
"1 0 20120103 US S 19170500 \n",
"2 0 20120103 US S 19271200 \n",
"3 0 20120103 US S 19400300 \n",
"4 0 20120103 US A 19640700 \n",
"\n",
" category date_published \n",
"0 cited by other 20120103 \n",
"1 cited by other 20120103 \n",
"2 cited by examiner 20120103 \n",
"3 cited by examiner 20120103 \n",
"4 cited by other 20120103 \n"
]
}
],
"source": [
"import sqlite3\n",
"import pandas as pd\n",
"\n",
"conn = sqlite3.connect(database_path)\n",
"cursor = conn.cursor()\n",
"\n",
"# query = \"\"\"\n",
"# SELECT\n",
"# patent_id, patcit_doc_number, patcit_country, patcit_kind, patcit_date, category\n",
"# FROM usp_citations\n",
"# \"\"\"\n",
"\n",
"query = \"\"\"\n",
"SELECT\n",
" patent_id, date_published patcit_doc_number, patcit_country, patcit_kind, patcit_date, category,\n",
" date_published\n",
"FROM usp_citations\n",
"JOIN us_patents ON usp_citations.patent_id = us_patents.id;\n",
"\"\"\"\n",
"\n",
"# Execute the query\n",
"cursor.execute(query)\n",
"\n",
"# Fetch the results\n",
"results = cursor.fetchall()\n",
"\n",
"# Get the column names (this can be useful for the DataFrame)\n",
"columns = [description[0] for description in cursor.description]\n",
"\n",
"# Create a DataFrame from the results\n",
"df = pd.DataFrame(results, columns=columns)\n",
"\n",
"# Close the database connection\n",
"conn.close()\n",
"\n",
"# Now you can work with the DataFrame (df)\n",
"print(df.head()) # Print the first few rows of the DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>patent_id</th>\n",
" <th>patcit_doc_number</th>\n",
" <th>patcit_country</th>\n",
" <th>patcit_kind</th>\n",
" <th>patcit_date</th>\n",
" <th>category</th>\n",
" <th>date_published</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>20120103</td>\n",
" <td>US</td>\n",
" <td>S</td>\n",
" <td>18791100</td>\n",
" <td>cited by other</td>\n",
" <td>20120103</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>20120103</td>\n",
" <td>US</td>\n",
" <td>S</td>\n",
" <td>19170500</td>\n",
" <td>cited by other</td>\n",
" <td>20120103</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>20120103</td>\n",
" <td>US</td>\n",
" <td>S</td>\n",
" <td>19271200</td>\n",
" <td>cited by examiner</td>\n",
" <td>20120103</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>20120103</td>\n",
" <td>US</td>\n",
" <td>S</td>\n",
" <td>19400300</td>\n",
" <td>cited by examiner</td>\n",
" <td>20120103</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>20120103</td>\n",
" <td>US</td>\n",
" <td>A</td>\n",
" <td>19640700</td>\n",
" <td>cited by other</td>\n",
" <td>20120103</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11733475</th>\n",
" <td>277283</td>\n",
" <td>20121225</td>\n",
" <td>JP</td>\n",
" <td>A</td>\n",
" <td>20010400</td>\n",
" <td>cited by other</td>\n",
" <td>20121225</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11733476</th>\n",
" <td>277283</td>\n",
" <td>20121225</td>\n",
" <td>JP</td>\n",
" <td>A</td>\n",
" <td>20051000</td>\n",
" <td>cited by other</td>\n",
" <td>20121225</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11733477</th>\n",
" <td>277283</td>\n",
" <td>20121225</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>cited by other</td>\n",
" <td>20121225</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11733478</th>\n",
" <td>277283</td>\n",
" <td>20121225</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>cited by other</td>\n",
" <td>20121225</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11733479</th>\n",
" <td>277283</td>\n",
" <td>20121225</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>cited by other</td>\n",
" <td>20121225</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>11733480 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" patent_id patcit_doc_number patcit_country patcit_kind patcit_date \\\n",
"0 0 20120103 US S 18791100 \n",
"1 0 20120103 US S 19170500 \n",
"2 0 20120103 US S 19271200 \n",
"3 0 20120103 US S 19400300 \n",
"4 0 20120103 US A 19640700 \n",
"... ... ... ... ... ... \n",
"11733475 277283 20121225 JP A 20010400 \n",
"11733476 277283 20121225 JP A 20051000 \n",
"11733477 277283 20121225 None None None \n",
"11733478 277283 20121225 None None None \n",
"11733479 277283 20121225 None None None \n",
"\n",
" category date_published \n",
"0 cited by other 20120103 \n",
"1 cited by other 20120103 \n",
"2 cited by examiner 20120103 \n",
"3 cited by examiner 20120103 \n",
"4 cited by other 20120103 \n",
"... ... ... \n",
"11733475 cited by other 20121225 \n",
"11733476 cited by other 20121225 \n",
"11733477 cited by other 20121225 \n",
"11733478 cited by other 20121225 \n",
"11733479 cited by other 20121225 \n",
"\n",
"[11733480 rows x 7 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
},
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
"\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
"\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
]
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(11733480, 7)"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>patcit_doc_number</th>\n",
" <th>patcit_country</th>\n",
" <th>patcit_kind</th>\n",
" <th>patcit_date</th>\n",
" <th>category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>D11495</td>\n",
" <td>US</td>\n",
" <td>S</td>\n",
" <td>18791100</td>\n",
" <td>cited by other</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>D50715</td>\n",
" <td>US</td>\n",
" <td>S</td>\n",
" <td>19170500</td>\n",
" <td>cited by other</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>D74119</td>\n",
" <td>US</td>\n",
" <td>S</td>\n",
" <td>19271200</td>\n",
" <td>cited by examiner</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>D119611</td>\n",
" <td>US</td>\n",
" <td>S</td>\n",
" <td>19400300</td>\n",
" <td>cited by examiner</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3140954</td>\n",
" <td>US</td>\n",
" <td>A</td>\n",
" <td>19640700</td>\n",
" <td>cited by other</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11733475</th>\n",
" <td>2001-108601</td>\n",
" <td>JP</td>\n",
" <td>A</td>\n",
" <td>20010400</td>\n",
" <td>cited by other</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11733476</th>\n",
" <td>2005-283538</td>\n",
" <td>JP</td>\n",
" <td>A</td>\n",
" <td>20051000</td>\n",
" <td>cited by other</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11733477</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>cited by other</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11733478</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>cited by other</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11733479</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>cited by other</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>11733480 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" patcit_doc_number patcit_country patcit_kind patcit_date \\\n",
"0 D11495 US S 18791100 \n",
"1 D50715 US S 19170500 \n",
"2 D74119 US S 19271200 \n",
"3 D119611 US S 19400300 \n",
"4 3140954 US A 19640700 \n",
"... ... ... ... ... \n",
"11733475 2001-108601 JP A 20010400 \n",
"11733476 2005-283538 JP A 20051000 \n",
"11733477 None None None None \n",
"11733478 None None None None \n",
"11733479 None None None None \n",
"\n",
" category \n",
"0 cited by other \n",
"1 cited by other \n",
"2 cited by examiner \n",
"3 cited by examiner \n",
"4 cited by other \n",
"... ... \n",
"11733475 cited by other \n",
"11733476 cited by other \n",
"11733477 cited by other \n",
"11733478 cited by other \n",
"11733479 cited by other \n",
"\n",
"[11733480 rows x 5 columns]"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"0it [00:00, ?it/s]\n"
]
},
{
"ename": "ValueError",
"evalue": "not enough values to unpack (expected 1, got 0)",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[23], line 7\u001b[0m\n\u001b[0;32m 1\u001b[0m query \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;124mSELECT\u001b[39m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;124m patent_id, patcit_doc_number, patcit_country, patcit_kind, patcit_date, category\u001b[39m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;124mFROM usp_citations\u001b[39m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[0;32m 6\u001b[0m i\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m\n\u001b[1;32m----> 7\u001b[0m \u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mrow\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtqdm\u001b[49m\u001b[43m(\u001b[49m\u001b[43muspto_data\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mquery\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[0;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mprint\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\edgar\\OneDrive\\Bureau\\Ecole\\HEC\\A24\\BrevetNLP\\.conda\\Lib\\site-packages\\tqdm\\std.py:1181\u001b[0m, in \u001b[0;36mtqdm.__iter__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1178\u001b[0m time \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_time\n\u001b[0;32m 1180\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m-> 1181\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43miterable\u001b[49m\u001b[43m:\u001b[49m\n\u001b[0;32m 1182\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01myield\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\n\u001b[0;32m 1183\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Update and possibly print the progressbar.\u001b[39;49;00m\n\u001b[0;32m 1184\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Note: does not call self.update(1) for speed optimisation.\u001b[39;49;00m\n",
"File \u001b[1;32mc:\\Users\\edgar\\OneDrive\\Bureau\\Ecole\\HEC\\A24\\BrevetNLP\\.conda\\Lib\\site-packages\\alexandria3k\\data_source.py:706\u001b[0m, in \u001b[0;36mDataSource.query\u001b[1;34m(self, query, partition)\u001b[0m\n\u001b[0;32m 704\u001b[0m \u001b[38;5;66;03m# Easy case\u001b[39;00m\n\u001b[0;32m 705\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m partition:\n\u001b[1;32m--> 706\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[43mtry_sql_execute\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcursor\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mquery\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 707\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m 709\u001b[0m \u001b[38;5;66;03m# Even when restricting multiple JOINs with container_id\u001b[39;00m\n\u001b[0;32m 710\u001b[0m \u001b[38;5;66;03m# SQLite seems to scan all containers for each JOIN making the\u001b[39;00m\n\u001b[0;32m 711\u001b[0m \u001b[38;5;66;03m# performance intolerably slow. Address this by creating non-virtual\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 719\u001b[0m \u001b[38;5;66;03m# Run query on in-memory database\u001b[39;00m\n\u001b[0;32m 720\u001b[0m \u001b[38;5;66;03m# drop tables\u001b[39;00m\n",
"File \u001b[1;32mc:\\Users\\edgar\\OneDrive\\Bureau\\Ecole\\HEC\\A24\\BrevetNLP\\.conda\\Lib\\site-packages\\alexandria3k\\common.py:200\u001b[0m, in \u001b[0;36mtry_sql_execute\u001b[1;34m(execution_context, statement)\u001b[0m\n\u001b[0;32m 187\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 188\u001b[0m \u001b[38;5;124;03mReturn the result of executing the specified SQL statement.\u001b[39;00m\n\u001b[0;32m 189\u001b[0m \u001b[38;5;124;03mThe statement is logged through log_sql. If the satement's\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 197\u001b[0m \u001b[38;5;124;03m:type statement: str\u001b[39;00m\n\u001b[0;32m 198\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 199\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 200\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mexecution_context\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlog_sql\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstatement\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 201\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m apsw\u001b[38;5;241m.\u001b[39mSQLError \u001b[38;5;28;01mas\u001b[39;00m exception:\n\u001b[0;32m 202\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m Alexandria3kError(\n\u001b[0;32m 203\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSQL statement \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstatement\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m failed: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mexception\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 204\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mexception\u001b[39;00m\n",
"File \u001b[1;32mD:\\a\\apsw\\apsw\\src\\vtable.c:2350\u001b[0m, in \u001b[0;36mVirtualTable.xFilter\u001b[1;34m()\u001b[0m\n",
"File \u001b[1;32mc:\\Users\\edgar\\OneDrive\\Bureau\\Ecole\\HEC\\A24\\BrevetNLP\\.conda\\Lib\\site-packages\\alexandria3k\\data_source.py:220\u001b[0m, in \u001b[0;36mElementsCursor.Filter\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m 217\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mFilter\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs):\n\u001b[0;32m 218\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Always called first to initialize an iteration to the first row\u001b[39;00m\n\u001b[0;32m 219\u001b[0m \u001b[38;5;124;03m of the table\"\"\"\u001b[39;00m\n\u001b[1;32m--> 220\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparent_cursor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mFilter\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 221\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39melements \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 222\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mNext()\n",
"File \u001b[1;32mc:\\Users\\edgar\\OneDrive\\Bureau\\Ecole\\HEC\\A24\\BrevetNLP\\.conda\\Lib\\site-packages\\alexandria3k\\data_sources\\uspto.py:369\u001b[0m, in \u001b[0;36mPatentsCursor.Filter\u001b[1;34m(self, index_number, _index_name, constraint_args)\u001b[0m\n\u001b[0;32m 366\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mFilter\u001b[39m(\u001b[38;5;28mself\u001b[39m, index_number, _index_name, constraint_args):\n\u001b[0;32m 367\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Always called first to initialize an iteration to the first row\u001b[39;00m\n\u001b[0;32m 368\u001b[0m \u001b[38;5;124;03m of the table according to the index\"\"\"\u001b[39;00m\n\u001b[1;32m--> 369\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfiles_cursor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mFilter\u001b[49m\u001b[43m(\u001b[49m\u001b[43mindex_number\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m_index_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconstraint_args\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 370\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39meof \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfiles_cursor\u001b[38;5;241m.\u001b[39mEof()\n\u001b[0;32m 371\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m index_number \u001b[38;5;241m&\u001b[39m ROWID_INDEX:\n\u001b[0;32m 372\u001b[0m \u001b[38;5;66;03m# This has never happened, so this is untested\u001b[39;00m\n",
"File \u001b[1;32mc:\\Users\\edgar\\OneDrive\\Bureau\\Ecole\\HEC\\A24\\BrevetNLP\\.conda\\Lib\\site-packages\\alexandria3k\\data_sources\\uspto.py:260\u001b[0m, in \u001b[0;36mPatentsFilesCursor.Filter\u001b[1;34m(self, index_number, _index_name, constraint_args)\u001b[0m\n\u001b[0;32m 256\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontainer_id \u001b[38;5;241m=\u001b[39m constraint_args[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 257\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcurrent_file_path \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 258\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtable\u001b[38;5;241m.\u001b[39mdata_source\u001b[38;5;241m.\u001b[39mget_current_zip_path()\n\u001b[0;32m 259\u001b[0m )\n\u001b[1;32m--> 260\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mNext\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\edgar\\OneDrive\\Bureau\\Ecole\\HEC\\A24\\BrevetNLP\\.conda\\Lib\\site-packages\\alexandria3k\\data_sources\\uspto.py:270\u001b[0m, in \u001b[0;36mPatentsFilesCursor.Next\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 268\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontainer_id \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 269\u001b[0m \u001b[38;5;66;03m# Zip file read.\u001b[39;00m\n\u001b[1;32m--> 270\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mxml_contents \u001b[38;5;241m=\u001b[39m \u001b[43mget_zip_cache\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 271\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcurrent_file_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdata_source\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msample\u001b[49m\n\u001b[0;32m 272\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 274\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontainer_id \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mxml_contents):\n\u001b[0;32m 275\u001b[0m \u001b[38;5;66;03m# Zip file ended.\u001b[39;00m\n\u001b[0;32m 276\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[0;32m 277\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtable\u001b[38;5;241m.\u001b[39mdata_source\u001b[38;5;241m.\u001b[39mlength_of_zip_files()\n\u001b[0;32m 278\u001b[0m \u001b[38;5;241m>\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mzip_index \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 279\u001b[0m ):\n\u001b[0;32m 280\u001b[0m \u001b[38;5;66;03m# Moving to the next available Zip file.\u001b[39;00m\n\u001b[0;32m 281\u001b[0m \u001b[38;5;66;03m# Updating new container id.\u001b[39;00m\n",
"File \u001b[1;32mc:\\Users\\edgar\\OneDrive\\Bureau\\Ecole\\HEC\\A24\\BrevetNLP\\.conda\\Lib\\site-packages\\alexandria3k\\uspto_zip_cache.py:58\u001b[0m, in \u001b[0;36mUsptoZipCache.read\u001b[1;34m(self, zip_path, sampling)\u001b[0m\n\u001b[0;32m 53\u001b[0m xml_file \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m 54\u001b[0m file \u001b[38;5;28;01mfor\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m zip_ref\u001b[38;5;241m.\u001b[39mnamelist() \u001b[38;5;28;01mif\u001b[39;00m file\u001b[38;5;241m.\u001b[39mendswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.xml\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 55\u001b[0m ]\n\u001b[0;32m 57\u001b[0m \u001b[38;5;66;03m# Extract filename and decoding the XML\u001b[39;00m\n\u001b[1;32m---> 58\u001b[0m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfile_name,) \u001b[38;5;241m=\u001b[39m xml_file\n\u001b[0;32m 59\u001b[0m xml_content \u001b[38;5;241m=\u001b[39m zip_ref\u001b[38;5;241m.\u001b[39mread(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfile_name)\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 61\u001b[0m \u001b[38;5;66;03m# The first item of the list is None.\u001b[39;00m\n",
"\u001b[1;31mValueError\u001b[0m: not enough values to unpack (expected 1, got 0)"
]
}
],
"source": [
"query = \"\"\"\n",
"SELECT\n",
" patent_id, patcit_doc_number, patcit_country, patcit_kind, patcit_date, category\n",
"FROM usp_citations\n",
"\"\"\"\n",
"i=0\n",
"for row in tqdm(uspto_data.query(query)):\n",
" print(row)\n"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"0it [00:00, ?it/s]\n"
]
},
{
"ename": "ConnectionClosedError",
"evalue": "The connection has been closed",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mConnectionClosedError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[38], line 7\u001b[0m\n\u001b[0;32m 1\u001b[0m query \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;124mSELECT\u001b[39m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;124m patent_id, patcit_doc_number, patcit_country, patcit_kind, patcit_date, category\u001b[39m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;124mFROM usp_citations\u001b[39m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[0;32m 6\u001b[0m i\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m\n\u001b[1;32m----> 7\u001b[0m \u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mrow\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtqdm\u001b[49m\u001b[43m(\u001b[49m\u001b[43muspto_data\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mquery\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[0;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mprint\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\edgar\\OneDrive\\Bureau\\Ecole\\HEC\\A24\\BrevetNLP\\.conda\\Lib\\site-packages\\tqdm\\std.py:1181\u001b[0m, in \u001b[0;36mtqdm.__iter__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1178\u001b[0m time \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_time\n\u001b[0;32m 1180\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m-> 1181\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43miterable\u001b[49m\u001b[43m:\u001b[49m\n\u001b[0;32m 1182\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01myield\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\n\u001b[0;32m 1183\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Update and possibly print the progressbar.\u001b[39;49;00m\n\u001b[0;32m 1184\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Note: does not call self.update(1) for speed optimisation.\u001b[39;49;00m\n",
"File \u001b[1;32mc:\\Users\\edgar\\OneDrive\\Bureau\\Ecole\\HEC\\A24\\BrevetNLP\\.conda\\Lib\\site-packages\\alexandria3k\\data_source.py:702\u001b[0m, in \u001b[0;36mDataSource.query\u001b[1;34m(self, query, partition)\u001b[0m\n\u001b[0;32m 675\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mquery\u001b[39m(\u001b[38;5;28mself\u001b[39m, query, partition\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m):\n\u001b[0;32m 676\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 677\u001b[0m \u001b[38;5;124;03m Run the specified query on the virtual database using the data\u001b[39;00m\n\u001b[0;32m 678\u001b[0m \u001b[38;5;124;03m specified in the object constructor's call.\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 699\u001b[0m \u001b[38;5;124;03m :rtype: iterable\u001b[39;00m\n\u001b[0;32m 700\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 702\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcursor \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvdb\u001b[38;5;241m.\u001b[39mcursor()\n\u001b[0;32m 704\u001b[0m \u001b[38;5;66;03m# Easy case\u001b[39;00m\n\u001b[0;32m 705\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m partition:\n",
"\u001b[1;31mConnectionClosedError\u001b[0m: The connection has been closed"
]
}
],
"source": [
"query = \"\"\"\n",
"SELECT\n",
" patent_id, patcit_doc_number, patcit_country, patcit_kind, patcit_date, category\n",
"FROM usp_citations\n",
"\"\"\"\n",
"i=0\n",
"for row in tqdm(uspto_data.query(query)):\n",
" print(row)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
GitHub Events
Total
- Push event: 32
- Create event: 2
Last Year
- Push event: 32
- Create event: 2