citation-prediction
Science Score: 18.0%
This score indicates how likely this project is to be science-related based on various indicators:
-
✓CITATION.cff file
Found CITATION.cff file -
○codemeta.json file
-
○.zenodo.json file
-
○DOI references
-
○Academic publication links
-
○Academic email domains
-
○Institutional organization owner
-
○JOSS paper metadata
-
○Scientific vocabulary similarity
Low similarity (2.5%) to scientific vocabulary
Last synced: 10 months ago
·
JSON representation
·
Repository
Basic Info
- Host: GitHub
- Owner: stogiannidis
- Language: Jupyter Notebook
- Default Branch: main
- Size: 6.84 KB
Statistics
- Stars: 0
- Watchers: 2
- Forks: 0
- Open Issues: 0
- Releases: 0
Created almost 4 years ago
· Last pushed almost 4 years ago
Metadata Files
Readme
Citation
README.md
Citation-Prediction
This notebook is an earlier draft of the final notebook. Due to limited time, I am not able to finish and upload the final notebook, which contains state-of-the-art models like BERT and comments that make it easier to understand. As soon as I have more time available, I am going to upload it and write a better README file :D
Owner
- Name: Ilias M. Stogiannidis
- Login: stogiannidis
- Kind: user
- Location: Athens
- Company: @Helvia
- Website: https://stogiannidis.github.io/
- Twitter: istogiannidis
- Repositories: 19
- Profile: https://github.com/stogiannidis
Applied ML Researcher @ helvia.ai. Interested in various ML domains; see my website for more.
Citation (citation_prediction.ipynb)
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"import networkx as nx\n",
"import gensim\n",
"import csv\n",
"import numpy as np\n",
"from random import randint\n",
"from nltk.tokenize import word_tokenize, sent_tokenize\n",
"from nltk.stem import WordNetLemmatizer\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.utils import shuffle\n",
"import nltk\n",
"from gensim.models import Word2Vec\n",
"from gensim.models import Doc2Vec\n",
"from collections import Counter\n",
"from nltk.corpus import stopwords\n",
"from sklearn.metrics.pairwise import cosine_similarity as cosim"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"import warnings\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of nodes: 138499\n",
"Number of edges: 1091955\n"
]
}
],
"source": [
"# Build the undirected (G) and directed (diG) graphs from the same edge list\n",
"G = nx.read_edgelist('edgelist.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)\n",
"diG = nx.read_edgelist('edgelist.txt', delimiter=',', create_using=nx.DiGraph(), nodetype=int)\n",
"nodes = list(G.nodes())\n",
"\n",
"# Node and edge counts of the undirected graph; reused when sizing the training matrix\n",
"n, m = G.number_of_nodes(), G.number_of_edges()\n",
"print(f'Number of nodes: {n}')\n",
"print(f'Number of edges: {m}')\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"n2v = Word2Vec.load('n2v_model.model')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"# Read the abstract of each paper.\n",
"# Each line has the form '<node id>|--|<abstract text>'.\n",
"abstracts = dict()\n",
"with open('abstracts.txt', 'r', encoding='utf-8') as f:\n",
"    for line in f:\n",
"        # Split on the first separator only, so an abstract whose text\n",
"        # happens to contain '|--|' does not break the two-value unpacking.\n",
"        node, abstract = line.split('|--|', 1)\n",
"        abstracts[int(node)] = abstract"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"# Read the authors of each paper.\n",
"# Each line has the form '<node id>|--|<comma-separated author names>'.\n",
"authors = dict()\n",
"with open('authors.txt', 'r', encoding='utf-8') as f:\n",
"    for line in f:\n",
"        # Split on the first separator only, so a name that happens to\n",
"        # contain '|--|' does not break the two-value unpacking.\n",
"        node, author = line.split('|--|', 1)\n",
"        authors[int(node)] = author"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"stop_words = set(stopwords.words('english'))\n",
"stop_words.update(['.', ',', '\"', \"'\", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',\"'d\", \"'ll\", \"'re\", \"'s\", \"'ve\", '``', 'could', 'might', 'must', \"n't\", 'need', 'sha', 'wo', 'would','”','“'])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"lemmatizer = WordNetLemmatizer()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Split every comma-separated author string into a list of whitespace-trimmed names\n",
"tokenized_authors = {\n",
"    node: [name.strip() for name in author_line.split(',')]\n",
"    for node, author_line in authors.items()\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"# Tokenize each abstract sentence by sentence, lower-case the tokens,\n",
"# drop stop words / punctuation and lemmatize what is left.\n",
"tokenized_abstracts = {}\n",
"for node, text in abstracts.items():\n",
"    lemmas = []\n",
"    for sentence in sent_tokenize(text):\n",
"        lowered = (token.lower() for token in word_tokenize(sentence))\n",
"        lemmas.extend(lemmatizer.lemmatize(word) for word in lowered if word not in stop_words)\n",
"    tokenized_abstracts[node] = lemmas"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"# Replace each raw abstract with the set of its unique processed tokens\n",
"abstracts = {node: set(tokenized_abstracts[node]) for node in abstracts}"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"# Replace each raw author string with the set of its unique author names\n",
"authors = {node: set(tokenized_authors[node]) for node in authors}"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"import math\n",
"\n",
"# Small constant added to the denominator so that an all-zero vector\n",
"# yields a similarity of 0 instead of a division by zero.\n",
"epsilon = 1e-6\n",
"\n",
"\n",
"def counter_cosine_similarity(c1, c2):\n",
"    # Cosine similarity of two sparse count vectors given as mappings\n",
"    # (e.g. collections.Counter) from term to count. Missing terms count as 0.\n",
"    terms = set(c1).union(c2)\n",
"    dot = 0\n",
"    sq_norm_1 = 0\n",
"    sq_norm_2 = 0\n",
"    for term in terms:\n",
"        w1 = c1.get(term, 0)\n",
"        w2 = c2.get(term, 0)\n",
"        dot += w1 * w2\n",
"        sq_norm_1 += w1**2\n",
"        sq_norm_2 += w2**2\n",
"    return dot / ((math.sqrt(sq_norm_1) * math.sqrt(sq_norm_2)) + epsilon)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"Compute the cosine similarity between the word-frequency vectors of two abstracts.\n",
"Each vector holds the frequency of every word in the corresponding abstract."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"import igraph as ig\n",
"ig_G = ig.Graph.from_networkx(G)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"cluster = nx.clustering(G)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"rank = nx.pagerank(G)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"h,a = nx.hits(G)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"triangles = nx.triangles(G)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Skip-gram (sg=1) word embeddings learned from the abstracts.\n",
"# gensim needs a positive worker-thread count: with workers=-1 the model is\n",
"# not trained, so use one thread per available CPU instead.\n",
"# NOTE(review): at this point the values of `abstracts` are sets of unique\n",
"# tokens, so word order inside the window is arbitrary - confirm this is intended.\n",
"model = Word2Vec(abstracts.values(), window=20, min_count=1, workers=os.cpu_count() or 1, sg=1)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Skip-gram (sg=1) embeddings of author names, learned from the author sets of each paper.\n",
"# gensim needs a positive worker-thread count: with workers=-1 the model is\n",
"# not trained, so use one thread per available CPU instead.\n",
"model_authors = Word2Vec(authors.values(), window=5, min_count=1, workers=os.cpu_count() or 1, sg=1)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"#import taggeddocument\n",
"from gensim.models.doc2vec import TaggedDocument\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Doc2Vec embeddings: one tagged document per node, tagged with the node id,\n",
"# built once from the abstract tokens and once from the author names.\n",
"docs = [TaggedDocument(words=tokenized_abstracts[node], tags=[node]) for node in abstracts]\n",
"ath_doca = [TaggedDocument(words=tokenized_authors[node], tags=[node]) for node in authors]\n",
"\n",
"# gensim needs a positive worker-thread count: with workers=-1 the models are\n",
"# not trained, so use one thread per available CPU instead.\n",
"n_workers = os.cpu_count() or 1\n",
"d2v = Doc2Vec(docs, vector_size=100, window=10, min_count=1, workers=n_workers)\n",
"d2v_ath = Doc2Vec(ath_doca, vector_size=100, window=5, min_count=1, workers=n_workers)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"from scipy.spatial.distance import cosine"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"# Give every empty abstract a single placeholder token so that no token list is empty\n",
"for node, tokens in tokenized_abstracts.items():\n",
"    if not tokens:\n",
"        tokenized_abstracts[node] = ['none']"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"# Betweenness centrality restricted to paths of length <= 5 (cutoff).\n",
"# igraph returns the scores in vertex-index order, i.e. the order of G.nodes(),\n",
"# not by node id. Key them by the original networkx node id (which\n",
"# Graph.from_networkx stores in the '_nx_name' vertex attribute) so that\n",
"# bet[node] looks up the score of that node rather than of vertex number `node`.\n",
"bet = dict(zip(ig_G.vs['_nx_name'], ig_G.betweenness(directed=False, cutoff=5)))"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"205924.46228046715"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bet[1]"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Training: 100%|██████████| 1091955/1091955 [45:33<00:00, 399.54it/s] \n"
]
}
],
"source": [
"# Build the training set. Every edge of G gives one positive sample (label 1)\n",
"# and, for each edge, one random pair of distinct non-adjacent nodes gives a\n",
"# negative sample (label 0). Rows 0..m-1 are positives, rows m..2m-1 negatives.\n",
"# Each sample is described by the 33 features returned by _pair_features().\n",
"\n",
"\n",
"def _pair_features(u, v):\n",
"    # Return the 33 features of the node pair (u, v), in column order.\n",
"    return [\n",
"        # 0-2: unique abstract terms - sum, absolute difference, number in common\n",
"        len(abstracts[u]) + len(abstracts[v]),\n",
"        abs(len(abstracts[u]) - len(abstracts[v])),\n",
"        len(abstracts[u].intersection(abstracts[v])),\n",
"        # 3-5: unique authors - sum, absolute difference, number in common\n",
"        len(authors[u]) + len(authors[v]),\n",
"        abs(len(authors[u]) - len(authors[v])),\n",
"        len(authors[u].intersection(authors[v])),\n",
"        # 6-7: cosine similarity of the author / abstract-term count vectors\n",
"        counter_cosine_similarity(Counter(tokenized_authors[u]), Counter(tokenized_authors[v])),\n",
"        counter_cosine_similarity(Counter(tokenized_abstracts[u]), Counter(tokenized_abstracts[v])),\n",
"        # 8-9: PageRank - sum, absolute difference\n",
"        rank[u] + rank[v],\n",
"        abs(rank[u] - rank[v]),\n",
"        # 10-11: clustering coefficient - sum, absolute difference\n",
"        cluster[u] + cluster[v],\n",
"        abs(cluster[u] - cluster[v]),\n",
"        # 12-13: HITS hub score - sum, absolute difference\n",
"        h[u] + h[v],\n",
"        abs(h[u] - h[v]),\n",
"        # 14-15: number of triangles - sum, absolute difference\n",
"        triangles[u] + triangles[v],\n",
"        abs(triangles[u] - triangles[v]),\n",
"        # 16-17: degree in the undirected graph - sum, absolute difference\n",
"        nx.degree(G, u) + nx.degree(G, v),\n",
"        abs(nx.degree(G, u) - nx.degree(G, v)),\n",
"        # 18: number of common neighbors\n",
"        len(list(nx.common_neighbors(G, u, v))),\n",
"        # 19: node2vec similarity between the two neighbor sets\n",
"        n2v.wv.n_similarity(G[u], G[v]),\n",
"        # 20: node2vec similarity between the two nodes themselves\n",
"        n2v.wv.similarity(u, v),\n",
"        # 21-22: Word2Vec similarity of the abstract tokens / author names\n",
"        model.wv.n_similarity(tokenized_abstracts[u], tokenized_abstracts[v]),\n",
"        model_authors.wv.n_similarity(tokenized_authors[u], tokenized_authors[v]),\n",
"        # 23-24: HITS authority score - sum, absolute difference\n",
"        a[u] + a[v],\n",
"        abs(a[u] - a[v]),\n",
"        # 25-26: cosine DISTANCE (1 - similarity) of the Doc2Vec abstract / author vectors\n",
"        cosine(d2v[u], d2v[v]),\n",
"        cosine(d2v_ath[u], d2v_ath[v]),\n",
"        # 27-28: in-degree in the directed graph - sum, absolute difference\n",
"        diG.in_degree(u) + diG.in_degree(v),\n",
"        abs(diG.in_degree(u) - diG.in_degree(v)),\n",
"        # 29-30: out-degree in the directed graph - sum, absolute difference\n",
"        diG.out_degree(u) + diG.out_degree(v),\n",
"        abs(diG.out_degree(u) - diG.out_degree(v)),\n",
"        # 31-32: betweenness centrality - sum, absolute difference\n",
"        bet[u] + bet[v],\n",
"        abs(bet[u] - bet[v]),\n",
"    ]\n",
"\n",
"\n",
"X_train = np.zeros((2*m, 33))\n",
"y_train = np.zeros(2*m)\n",
"n = G.number_of_nodes()\n",
"for i, (u, v) in tqdm(enumerate(G.edges()), desc='Training', total=m):\n",
"    # positive sample: an existing edge\n",
"    X_train[i] = _pair_features(u, v)\n",
"    y_train[i] = 1\n",
"\n",
"    # negative sample: a randomly generated pair of nodes without an edge.\n",
"    # Node ids are drawn from 0..n-1 (assumes the ids are contiguous - TODO confirm).\n",
"    # n1 == n2 is rejected too, so a node is never paired with itself.\n",
"    n1, n2 = randint(0, n-1), randint(0, n-1)\n",
"    while n1 == n2 or G.has_edge(n1, n2):\n",
"        n1, n2 = randint(0, n-1), randint(0, n-1)\n",
"    X_train[m+i] = _pair_features(n1, n2)\n",
"    y_train[m+i] = 0"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Size of training matrix: (2183910, 33)\n"
]
}
],
"source": [
"print('Size of training matrix:', X_train.shape)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"# Load the test samples: each line of test.txt holds two comma-separated node ids\n",
"with open('test.txt', 'r') as f:\n",
"    split_lines = (line.split(',') for line in f)\n",
"    node_pairs = [(int(fields[0]), int(fields[1])) for fields in split_lines]"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Creating test matrix: 100%|██████████| 106692/106692 [02:19<00:00, 767.22it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Size of test matrix: (106692, 33)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# Create the test matrix: one row per test pair, with the same 33 features\n",
"# (in the same column order) that are used for the training matrix.\n",
"X_test = np.zeros((len(node_pairs), 33))\n",
"for i, (u, v) in tqdm(enumerate(node_pairs), desc='Creating test matrix', total=len(node_pairs)):\n",
"    X_test[i] = [\n",
"        # 0-5: abstract terms and authors - sum, absolute difference, overlap\n",
"        len(abstracts[u]) + len(abstracts[v]),\n",
"        abs(len(abstracts[u]) - len(abstracts[v])),\n",
"        len(abstracts[u].intersection(abstracts[v])),\n",
"        len(authors[u]) + len(authors[v]),\n",
"        abs(len(authors[u]) - len(authors[v])),\n",
"        len(authors[u].intersection(authors[v])),\n",
"        # 6-7: cosine similarity of the author / abstract-term count vectors\n",
"        counter_cosine_similarity(Counter(tokenized_authors[u]), Counter(tokenized_authors[v])),\n",
"        counter_cosine_similarity(Counter(tokenized_abstracts[u]), Counter(tokenized_abstracts[v])),\n",
"        # 8-17: PageRank, clustering, hub score, triangles, degree - sum and absolute difference\n",
"        rank[u] + rank[v],\n",
"        abs(rank[u] - rank[v]),\n",
"        cluster[u] + cluster[v],\n",
"        abs(cluster[u] - cluster[v]),\n",
"        h[u] + h[v],\n",
"        abs(h[u] - h[v]),\n",
"        triangles[u] + triangles[v],\n",
"        abs(triangles[u] - triangles[v]),\n",
"        nx.degree(G, u) + nx.degree(G, v),\n",
"        abs(nx.degree(G, u) - nx.degree(G, v)),\n",
"        # 18: number of common neighbors\n",
"        len(list(nx.common_neighbors(G, u, v))),\n",
"        # 19-22: embedding similarities (neighbor sets, nodes, abstracts, authors)\n",
"        n2v.wv.n_similarity(G[u], G[v]),\n",
"        n2v.wv.similarity(u, v),\n",
"        model.wv.n_similarity(tokenized_abstracts[u], tokenized_abstracts[v]),\n",
"        model_authors.wv.n_similarity(tokenized_authors[u], tokenized_authors[v]),\n",
"        # 23-24: authority score - sum and absolute difference\n",
"        a[u] + a[v],\n",
"        abs(a[u] - a[v]),\n",
"        # 25-26: cosine distance of the Doc2Vec abstract / author vectors\n",
"        cosine(d2v[u], d2v[v]),\n",
"        cosine(d2v_ath[u], d2v_ath[v]),\n",
"        # 27-32: in-degree, out-degree, betweenness - sum and absolute difference\n",
"        diG.in_degree(u) + diG.in_degree(v),\n",
"        abs(diG.in_degree(u) - diG.in_degree(v)),\n",
"        diG.out_degree(u) + diG.out_degree(v),\n",
"        abs(diG.out_degree(u) - diG.out_degree(v)),\n",
"        bet[u] + bet[v],\n",
"        abs(bet[u] - bet[v]),\n",
"    ]\n",
"print('Size of test matrix:', X_test.shape)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"X_train, y_train = shuffle(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[LightGBM] [Warning] Unknown parameter: random_sate\n"
]
}
],
"source": [
"import lightgbm as lgb\n",
"\n",
"# Binary LightGBM classifier trained on the pair features.\n",
"# The seed keyword was previously misspelled `random_sate`, which LightGBM\n",
"# reported as an unknown parameter; spelled correctly, the seed is applied.\n",
"lb = lgb.LGBMClassifier(objective='binary', device='gpu', random_state=47)\n",
"lb.fit(X_train, y_train)\n",
"\n",
"# Keep only the second probability column (class 1, i.e. the pair is an edge)\n",
"y_pred_lgb = lb.predict_proba(X_test)[:, 1]"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"# Write predictions to a file: one row per test pair, where `id` is the row\n",
"# index in the test matrix and `predicted` is the LightGBM edge probability.\n",
"# (The previous version read `ypred`, a name this notebook never defines.)\n",
"# newline='' is what the csv module asks for, so no blank rows are emitted.\n",
"with open('submissions.csv', 'w', newline='') as pred:\n",
"    csv_out = csv.writer(pred)\n",
"    csv_out.writerow(['id', 'predicted'])\n",
"    csv_out.writerows(enumerate(y_pred_lgb))"
]
}
],
"metadata": {
"interpreter": {
"hash": "142831d04f5b5b00c5675dbb014767191340933e002f313d1f5b93272cd1b868"
},
"kernelspec": {
"display_name": "Python 3.9.0 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}