citation-prediction

https://github.com/stogiannidis/citation-prediction
Science Score: 18.0%

This score indicates how likely this project is to be science-related based on various indicators:
✓
CITATION.cff file
Found CITATION.cff file
○
codemeta.json file
○
.zenodo.json file
○
DOI references
○
Academic publication links
○
Academic email domains
○
Institutional organization owner
○
JOSS paper metadata
○
Scientific vocabulary similarity
Low similarity (2.5%) to scientific vocabulary
Last synced: 10 months ago · JSON representation ·
Repository

Basic Info

Host: GitHub
Owner: stogiannidis
Language: Jupyter Notebook
Default Branch: main
Size: 6.84 KB
Statistics

Stars: 0
Watchers: 2
Forks: 0
Open Issues: 0
Releases: 0
Created almost 4 years ago · Last pushed almost 4 years ago
Metadata Files

Readme Citation
Citation-Prediction

This notebook is an earlier draft of the final notebook. Due to limited time, I am not able to finish and upload the final notebook, which contains state-of-the-art models such as BERT and comments to make understanding easier. As soon as I have more time available, I am going to upload it and write a better readme file :D
Owner

Name: Ilias M. Stogiannidis
Login: stogiannidis
Kind: user
Location: Athens
Company: @Helvia
Website: https://stogiannidis.github.io/
Twitter: istogiannidis
Repositories: 19
Profile: https://github.com/stogiannidis
Applied ML Researcher @ helvia.ai. Interested in various ML domains; see my website for more.
Citation (citation_prediction.ipynb)

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "import networkx as nx\n",
    "import gensim\n",
    "import csv\n",
    "import numpy as np\n",
    "from random import randint\n",
    "from nltk.tokenize import word_tokenize, sent_tokenize\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.utils import shuffle\n",
    "import nltk\n",
    "from gensim.models import Word2Vec\n",
    "from gensim.models import Doc2Vec\n",
    "from collections import Counter\n",
    "from nltk.corpus import stopwords\n",
    "from sklearn.metrics.pairwise import cosine_similarity as cosim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of nodes: 138499\n",
      "Number of edges: 1091955\n"
     ]
    }
   ],
   "source": [
    "# Create a graph\n",
    "G = nx.read_edgelist('edgelist.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)\n",
    "diG = nx.read_edgelist('edgelist.txt', delimiter=',', create_using=nx.DiGraph(), nodetype=int)\n",
    "nodes = list(G.nodes())\n",
    "n = G.number_of_nodes()\n",
    "m = G.number_of_edges()\n",
    "print('Number of nodes:', n)\n",
    "print('Number of edges:', m)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "n2v = Word2Vec.load('n2v_model.model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Read the abstract of each paper\n",
    "abstracts = dict()\n",
    "with open('abstracts.txt', 'r', encoding='utf-8') as f:\n",
    "    for line in f:\n",
    "        node, abstract = line.split('|--|')\n",
    "        abstracts[int(node)] = abstract"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Read the authors of each paper\n",
    "authors = dict()\n",
    "with open('authors.txt', 'r', encoding='utf-8') as f:\n",
    "    for line in f:\n",
    "        node, author = line.split('|--|')\n",
    "        authors[int(node)] = author"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "stop_words = set(stopwords.words('english'))\n",
    "stop_words.update(['.', ',', '\"', \"'\", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',\"'d\", \"'ll\", \"'re\", \"'s\", \"'ve\", '``', 'could', 'might', 'must', \"n't\", 'need', 'sha', 'wo', 'would','”','“'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "lemmatizer = WordNetLemmatizer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenized_authors = dict()\n",
    "for node in authors:\n",
    "    temp = authors[node].split(',')\n",
    "    tokenized_authors[node] = []\n",
    "    for author in temp:\n",
    "        tokenized_authors[node].append(author.strip())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "tokenized_abstracts = dict()\n",
    "for node in abstracts:\n",
    "    tokenized_abstracts[node] = []\n",
    "    for sent in sent_tokenize(abstracts[node]):\n",
    "        for i in word_tokenize(sent):\n",
    "            word = i.lower()\n",
    "            if word in stop_words:\n",
    "                continue\n",
    "            else:\n",
    "                tokenized_abstracts[node].append(lemmatizer.lemmatize(word))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "for node in abstracts:\n",
    "    abstracts[node] = set(tokenized_abstracts[node])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "for node in authors:\n",
    "    authors[node] = set(tokenized_authors[node])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "import math\n",
    "epsilon = 1e-6\n",
    "def counter_cosine_similarity(c1, c2):\n",
    "    terms = set(c1).union(c2)\n",
    "    dotprod = sum(c1.get(k, 0) * c2.get(k, 0) for k in terms)\n",
    "    magA = math.sqrt(sum(c1.get(k, 0)**2 for k in terms))\n",
    "    magB = math.sqrt(sum(c2.get(k, 0)**2 for k in terms))\n",
    "    return dotprod / ((magA * magB) + epsilon)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "Calculating the cosine similarity between the similar words of the abstracts\n",
    "The vectors represent the frequency of the words in each abstracts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import igraph as ig\n",
    "ig_G = ig.Graph.from_networkx(G)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "cluster = nx.clustering(G)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "rank = nx.pagerank(G)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "h,a = nx.hits(G)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "triangles = nx.triangles(G)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "model = Word2Vec(abstracts.values(), window=20, min_count=1, workers=-1,sg=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "model_authors = Word2Vec(authors.values(), window=5, min_count=1, workers=-1,sg=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "#import taggeddocument\n",
    "from gensim.models.doc2vec import TaggedDocument\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "docs = [TaggedDocument(words=tokenized_abstracts[node], tags=[node]) for node in abstracts]\n",
    "ath_doca = [TaggedDocument(words=tokenized_authors[node], tags=[node]) for node in authors]\n",
    "d2v = Doc2Vec(docs, vector_size=100, window=10, min_count=1, workers= -1)\n",
    "d2v_ath = Doc2Vec(ath_doca, vector_size=100, window=5, min_count=1, workers= -1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "from scipy.spatial.distance import cosine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "for node in tokenized_abstracts:\n",
    "    if tokenized_abstracts[node] == []:\n",
    "        tokenized_abstracts[node] = ['none']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "bet = ig_G.betweenness(directed=False, cutoff=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "205924.46228046715"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bet[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Training: 100%|██████████| 1091955/1091955 [45:33<00:00, 399.54it/s] \n"
     ]
    }
   ],
   "source": [
    "# its class label is 1 if it corresponds to an edge and 0, otherwise.\n",
    "# Use the following 3 features for each pair of nodes:\n",
    "# (1) sum of number of unique terms of the two nodes' abstracts\n",
    "# (2) absolute value of difference of number of unique terms of the two nodes' abstracts\n",
    "# (3) number of common terms between the abstracts of the two nodes\n",
    "# (4) sum of number of unique terms of the two nodes' authors\n",
    "# (5) absolute value of difference of number of unique terms of the two nodes' authors\n",
    "# (6) \n",
    "\n",
    "X_train = np.zeros((2*m, 33))\n",
    "y_train = np.zeros(2*m)\n",
    "n = G.number_of_nodes()\n",
    "for i,edge in tqdm(enumerate(G.edges()), desc='Training', total=m):\n",
    "    # an edge\n",
    "    X_train[i,0] = len(abstracts[edge[0]]) + len(abstracts[edge[1]])\n",
    "    X_train[i,1] = abs(len(abstracts[edge[0]]) - len(abstracts[edge[1]]))\n",
    "    X_train[i,2] = len(abstracts[edge[0]].intersection(abstracts[edge[1]]))\n",
    "    X_train[i,3] = len(authors[edge[0]]) + len(authors[edge[1]])\n",
    "    X_train[i,4] = abs(len(authors[edge[0]]) - len(authors[edge[1]]))\n",
    "    X_train[i,5] = len(authors[edge[0]].intersection(authors[edge[1]]))\n",
    "    X_train[i,6] = counter_cosine_similarity(Counter(tokenized_authors[edge[0]]), Counter(tokenized_authors[edge[1]]))\n",
    "    X_train[i,7] = counter_cosine_similarity(Counter(tokenized_abstracts[edge[0]]), Counter(tokenized_abstracts[edge[1]]))\n",
    "    X_train[i,8] = rank[edge[0]] + rank[edge[1]]\n",
    "    X_train[i,9] = abs(rank[edge[0]] - rank[edge[1]])\n",
    "    X_train[i,10] = cluster[edge[0]] + cluster[edge[1]]\n",
    "    X_train[i,11] = abs(cluster[edge[0]] - cluster[edge[1]])\n",
    "    X_train[i,12] = h[edge[0]] + h[edge[1]]\n",
    "    X_train[i,13] = abs(h[edge[0]] - h[edge[1]])\n",
    "    X_train[i,14] = triangles[edge[0]] + triangles[edge[1]]\n",
    "    X_train[i,15] = abs(triangles[edge[0]] - triangles[edge[1]])\n",
    "    X_train[i,16] = nx.degree(G, edge[0]) + nx.degree(G, edge[1])\n",
    "    X_train[i,17] = abs(nx.degree(G, edge[0]) - nx.degree(G, edge[1]))\n",
    "    X_train[i,18] = len(list(nx.common_neighbors(G, edge[0], edge[1])))\n",
    "    X_train[i,19] = n2v.wv.n_similarity(G[edge[0]], G[edge[1]])\n",
    "    X_train[i,20] = n2v.wv.similarity(edge[0], edge[1])\n",
    "    X_train[i,21] = model.wv.n_similarity(tokenized_abstracts[edge[0]], tokenized_abstracts[edge[1]])\n",
    "    X_train[i,22] = model_authors.wv.n_similarity(tokenized_authors[edge[0]], tokenized_authors[edge[1]])\n",
    "    X_train[i,23] = a[edge[0]] + a[edge[1]]\n",
    "    X_train[i,24] = abs(a[edge[0]] - a[edge[1]])\n",
    "    X_train[i,25] = cosine(d2v[edge[0]], d2v[edge[1]])\n",
    "    X_train[i,26] = cosine(d2v_ath[edge[0]], d2v_ath[edge[1]])\n",
    "    X_train[i,27] = diG.in_degree(edge[0]) + diG.in_degree(edge[1])\n",
    "    X_train[i,28] = abs(diG.in_degree(edge[0]) - diG.in_degree(edge[1]))\n",
    "    X_train[i,29] = diG.out_degree(edge[0]) + diG.out_degree(edge[1])\n",
    "    X_train[i,30] = abs(diG.out_degree(edge[0]) - diG.out_degree(edge[1]))\n",
    "    X_train[i,31] = bet[edge[0]] + bet[edge[1]]\n",
    "    X_train[i,32] = abs(bet[edge[0]] - bet[edge[1]])\n",
    "    y_train[i] = 1\n",
    "\n",
    "    # a randomly generated pair of nodes\n",
    "    n1 = randint(0, n-1)\n",
    "    n2 = randint(0, n-1)\n",
    "    while G.has_edge(n1, n2):\n",
    "        n1 = randint(0, n-1)\n",
    "        n2 = randint(0, n-1)\n",
    "    X_train[m+i,0] = len(abstracts[n1]) + len(abstracts[n2])\n",
    "    X_train[m+i,1] = abs(len(abstracts[n1]) - len(abstracts[n2]))\n",
    "    X_train[m+i,2] = len(abstracts[n1].intersection(abstracts[n2]))\n",
    "    X_train[m+i,3] = len(authors[n1]) + len(authors[n2])\n",
    "    X_train[m+i,4] = abs(len(authors[n1]) - len(authors[n2]))\n",
    "    X_train[m+i,5] = len(authors[n1].intersection(authors[n2]))\n",
    "    X_train[m+i,6] = counter_cosine_similarity(Counter(tokenized_authors[n1]), Counter(tokenized_authors[n2]))\n",
    "    X_train[m+i,7] = counter_cosine_similarity(Counter(tokenized_abstracts[n1]), Counter(tokenized_abstracts[n2]))\n",
    "    X_train[m+i,8] = rank[n1] + rank[n2] # sum of ranks of the two nodes\n",
    "    X_train[m+i,9] = abs(rank[n1] - rank[n2]) # absolute value of difference of ranks of the two nodes\n",
    "    X_train[m+i,10] = cluster[n1] + cluster[n2] # sum of clusters of the two nodes\n",
    "    X_train[m+i,11] = abs(cluster[n1] - cluster[n2]) # absolute value of difference of clusters of the two nodes\n",
    "    X_train[m+i,12] = h[n1] + h[n2] # sum of hubs of the two nodes\n",
    "    X_train[m+i,13] = abs(h[n1] - h[n2]) # absolute value of difference of hubs of the two nodes\n",
    "    X_train[m+i,14] = triangles[n1] + triangles[n2] # sum of triangles of the two nodes\n",
    "    X_train[m+i,15] = abs(triangles[n1] - triangles[n2]) # absolute value of difference of triangles of the two nodes\n",
    "    X_train[m+i,16] = nx.degree(G, n1) + nx.degree(G, n2) # sum of degrees of the two nodes\n",
    "    X_train[m+i,17] = abs(nx.degree(G, n1) - nx.degree(G, n2)) # absolute value of difference of degrees of the two nodes\n",
    "    X_train[m+i,18] = len(list(nx.common_neighbors(G, n1, n2))) # number of common neighbors of the two nodes\n",
    "    X_train[m+i,19] = n2v.wv.n_similarity(G[n1], G[n2]) # cosine similarity between the two nodes\n",
    "    X_train[m+i,20] = n2v.wv.similarity(n1, n2) # cosine similarity between the two nodes\n",
    "    X_train[m+i,21] = model.wv.n_similarity(tokenized_abstracts[n1], tokenized_abstracts[n2]) # cosine similarity between the two nodes\n",
    "    X_train[m+i,22] = model_authors.wv.n_similarity(tokenized_authors[n1], tokenized_authors[n2]) # cosine similarity between the two nodes\n",
    "    X_train[m+i,23] = a[n1] + a[n2] # sum of authors of the two nodes\n",
    "    X_train[m+i,24] = abs(a[n1] - a[n2]) # absolute value of difference of authors of the two nodes\n",
    "    X_train[m+i,25] = cosine(d2v[n1], d2v[n2]) # cosine similarity between the two nodes\n",
    "    X_train[m+i,26] = cosine(d2v_ath[n1], d2v_ath[n2]) # cosine similarity between the two nodes\n",
    "    X_train[m+i,27] = diG.in_degree(n1) + diG.in_degree(n2) # sum of in-degrees of the two nodes\n",
    "    X_train[m+i,28] = abs(diG.in_degree(n1) - diG.in_degree(n2)) # absolute value of difference of in-degrees of the two nodes\n",
    "    X_train[m+i,29] = diG.out_degree(n1) + diG.out_degree(n2) # sum of out-degrees of the two nodes\n",
    "    X_train[m+i,30] = abs(diG.out_degree(n1) - diG.out_degree(n2)) # absolute value of difference of out-degrees of the two nodes\n",
    "    X_train[m+i,31] = bet[n1] + bet[n2] # sum of betweenness centrality of the two nodes\n",
    "    X_train[m+i,32] = abs(bet[n1] - bet[n2]) # absolute value of difference of betweenness centrality of the two nodesed \n",
    "    y_train[m+i] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Size of training matrix: (2183910, 33)\n"
     ]
    }
   ],
   "source": [
    "print('Size of training matrix:', X_train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Read test data. Each sample is a pair of nodes\n",
    "node_pairs = list()\n",
    "with open('test.txt', 'r') as f:\n",
    "    for line in f:\n",
    "        t = line.split(',')\n",
    "        node_pairs.append((int(t[0]), int(t[1])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Creating test matrix: 100%|██████████| 106692/106692 [02:19<00:00, 767.22it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Size of test matrix: (106692, 33)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Create the test matrix. Use the same 4 features as above\n",
    "X_test = np.zeros((len(node_pairs), 33))\n",
    "for i,node_pair in tqdm(enumerate(node_pairs), desc='Creating test matrix', total=len(node_pairs)):\n",
    "    X_test[i,0] = len(abstracts[node_pair[0]]) + len(abstracts[node_pair[1]])\n",
    "    X_test[i,1] = abs(len(abstracts[node_pair[0]]) - len(abstracts[node_pair[1]]))\n",
    "    X_test[i,2] = len(abstracts[node_pair[0]].intersection(abstracts[node_pair[1]]))\n",
    "    X_test[i,3] = len(authors[node_pair[0]]) + len(authors[node_pair[1]])\n",
    "    X_test[i,4] = abs(len(authors[node_pair[0]]) - len(authors[node_pair[1]]))\n",
    "    X_test[i,5] = len(authors[node_pair[0]].intersection(authors[node_pair[1]]))\n",
    "    X_test[i,6] = counter_cosine_similarity(Counter(tokenized_authors[node_pair[0]]), Counter(tokenized_authors[node_pair[1]]))\n",
    "    X_test[i,7] = counter_cosine_similarity(Counter(tokenized_abstracts[node_pair[0]]), Counter(tokenized_abstracts[node_pair[1]]))\n",
    "    X_test[i,8] = rank[node_pair[0]] + rank[node_pair[1]]\n",
    "    X_test[i,9] = abs(rank[node_pair[0]] - rank[node_pair[1]])\n",
    "    X_test[i,10] = cluster[node_pair[0]] + cluster[node_pair[1]]\n",
    "    X_test[i,11] = abs(cluster[node_pair[0]] - cluster[node_pair[1]])\n",
    "    X_test[i,12] = h[node_pair[0]] + h[node_pair[1]]\n",
    "    X_test[i,13] = abs(h[node_pair[0]] - h[node_pair[1]])\n",
    "    X_test[i,14] = triangles[node_pair[0]] + triangles[node_pair[1]]\n",
    "    X_test[i,15] = abs(triangles[node_pair[0]] - triangles[node_pair[1]])\n",
    "    X_test[i,16] = nx.degree(G, node_pair[0]) + nx.degree(G, node_pair[1])\n",
    "    X_test[i,17] = abs(nx.degree(G, node_pair[0]) - nx.degree(G, node_pair[1]))\n",
    "    X_test[i,18] = len(list(nx.common_neighbors(G, node_pair[0], node_pair[1])))\n",
    "    X_test[i,19] = n2v.wv.n_similarity(G[node_pair[0]], G[node_pair[1]])\n",
    "    X_test[i,20] = n2v.wv.similarity(node_pair[0], node_pair[1])\n",
    "    X_test[i,21] = model.wv.n_similarity(tokenized_abstracts[node_pair[0]], tokenized_abstracts[node_pair[1]])\n",
    "    X_test[i,22] = model_authors.wv.n_similarity(tokenized_authors[node_pair[0]], tokenized_authors[node_pair[1]])\n",
    "    X_test[i,23] = a[node_pair[0]] + a[node_pair[1]]\n",
    "    X_test[i,24] = abs(a[node_pair[0]] - a[node_pair[1]])\n",
    "    X_test[i,25] = cosine(d2v[node_pair[0]], d2v[node_pair[1]])\n",
    "    X_test[i,26] = cosine(d2v_ath[node_pair[0]], d2v_ath[node_pair[1]])\n",
    "    X_test[i,27] = diG.in_degree(node_pair[0]) + diG.in_degree(node_pair[1])\n",
    "    X_test[i,28] = abs(diG.in_degree(node_pair[0]) - diG.in_degree(node_pair[1]))\n",
    "    X_test[i,29] = diG.out_degree(node_pair[0]) + diG.out_degree(node_pair[1])\n",
    "    X_test[i,30] = abs(diG.out_degree(node_pair[0]) - diG.out_degree(node_pair[1]))\n",
    "    X_test[i,31] = bet[node_pair[0]] + bet[node_pair[1]]\n",
    "    X_test[i,32] = abs(bet[node_pair[0]] - bet[node_pair[1]])\n",
    "print('Size of test matrix:', X_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "X_train, y_train = shuffle(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[LightGBM] [Warning] Unknown parameter: random_sate\n"
     ]
    }
   ],
   "source": [
    "import lightgbm as lgb\n",
    "\n",
    "lb = lgb.LGBMClassifier(objective='binary', device='gpu',random_sate=47)\n",
    "lb.fit(X_train, y_train)\n",
    "\n",
    "y_pred_lgb = lb.predict_proba(X_test)\n",
    "y_pred_lgb = y_pred_lgb[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Write predictions to a file\n",
    "predictions = zip(range(len(ypred)), ypred)\n",
    "with open(\"submissions.csv\",\"w\") as pred:\n",
    "    csv_out = csv.writer(pred)\n",
    "    csv_out.writerow(['id','predicted'])\n",
    "    for row in predictions:\n",
    "        csv_out.writerow(row)"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "142831d04f5b5b00c5675dbb014767191340933e002f313d1f5b93272cd1b868"
  },
  "kernelspec": {
   "display_name": "Python 3.9.0 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
ecosyste.ms

Data

Tools

Indexes

Applications

Experiments

Open Source Science

citation-prediction

Science Score: 18.0%

Repository

Basic Info

Statistics

Metadata Files

README.md

Citation-Prediction

Owner

Citation (citation_prediction.ipynb)

GitHub Events

Total

Last Year