Source code for federatedscope.gfl.dataset.dblp_new

import os.path as osp
import numpy as np
import networkx as nx
import torch
from torch_geometric.data import InMemoryDataset, download_url
from torch_geometric.utils import from_networkx
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as \
    sklearn_stopwords


class LemmaTokenizer(object):
    def __init__(self):
        from nltk.stem import WordNetLemmatizer
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        from nltk import word_tokenize
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
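
# NOTE (added for clarity): the tokenizer above depends on nltk data that is
# not bundled with nltk itself; a one-off `nltk.download('punkt')` and
# `nltk.download('wordnet')` may be required before first use.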


def build_feature(words, threshold):
    from nltk.corpus import stopwords as nltk_stopwords
    # Use a bag-of-words representation of the paper titles as paper features
    stopwords = sklearn_stopwords.union(set(nltk_stopwords.words('english')))
    vectorizer = CountVectorizer(min_df=int(threshold),
                                 stop_words=stopwords,
                                 tokenizer=LemmaTokenizer())
    features_paper = vectorizer.fit_transform(words)

    return features_paper
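
# Illustrative only, not from the original source: a hedged sketch of what
# `build_feature` returns on a toy input (titles invented here). Building
# the stop-word list also needs the nltk 'stopwords' corpus. With
# threshold=1 every non-stopword term is kept, and the result is a sparse
# document-term matrix with one row per title:
#
#   >>> feats = build_feature(['graph neural networks',
#   ...                        'federated graph learning'], threshold=1)
#   >>> feats.shape  # (2, number of distinct lemmatized terms)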


def build_graph(path, filename, FL=0, threshold=15):
    with open(osp.join(path, filename), 'r') as f:
        node_cnt = sum(1 for _ in f)

    G = nx.DiGraph()
    desc = node_cnt * [None]
    neighbors = node_cnt * [None]
    if FL == 1:
        conf2paper = dict()
    elif FL == 2:
        org2paper = dict()

    # Build node features from the paper titles
    with open(osp.join(path, filename), 'r') as f:
        for line in f:
            cols = line.strip().split('\t')
            nid, title = int(cols[0]), cols[3]
            desc[nid] = title

    features = np.array(build_feature(desc, threshold).todense(),
                        dtype=np.float32)

    # Build graph structure
    with open(osp.join(path, filename), 'r') as f:
        for line in f:
            cols = line.strip().split('\t')
            nid, conf, org, label = \
                int(cols[0]), cols[1], cols[2], int(cols[4])
            neighbors[nid] = [int(val) for val in cols[-1].split(',')]

            if FL == 1:
                conf2paper.setdefault(conf, []).append(nid)
            elif FL == 2:
                org2paper.setdefault(org, []).append(nid)

            G.add_node(nid, y=label, x=features[nid], index_orig=nid)

    for nid, nbs in enumerate(neighbors):
        for vid in nbs:
            G.add_edge(nid, vid)

    # Rebuild the graph with nodes sorted by id so that node positions match
    # `index_orig`; note that `nx.Graph()` also collapses the directed edges
    # into undirected ones.
    H = nx.Graph()
    H.add_nodes_from(sorted(G.nodes(data=True)))
    H.add_edges_from(G.edges(data=True))
    G = H
    graphs = []
    if FL == 1:
        for conf in conf2paper:
            graphs.append(from_networkx(nx.subgraph(G, conf2paper[conf])))
    elif FL == 2:
        for org in org2paper:
            graphs.append(from_networkx(nx.subgraph(G, org2paper[org])))
    else:
        graphs.append(from_networkx(G))

    return graphs
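
# A hedged usage sketch for `build_graph` (the path below is hypothetical):
# FL=0 yields a single global graph, while FL=1 / FL=2 yield one subgraph
# per conference / per organization, each as a PyG object carrying the `x`,
# `y` and `index_orig` node attributes:
#
#   >>> graphs = build_graph('/tmp/DBLPNew/raw', 'dblp_new.tsv', FL=1)
#   >>> len(graphs)  # one graph per conference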


class DBLPNew(InMemoryDataset):
    r"""DBLP dataset for node classification, with optional federated
    partitions by conference or organization.

    Args:
        root (string): Root directory where the dataset should be saved.
        FL (int): Federated setting, `0` for DBLP, `1` for FLDBLPbyConf
            (partitioned by conference), `2` for FLDBLPbyOrg (partitioned
            by organization).
        splits (list, optional): Train/val/test split ratios.
            (default: :obj:`[0.5, 0.2, 0.3]`)
        transform (callable, optional): A function/transform that takes in
            an :obj:`torch_geometric.data.Data` object and returns a
            transformed version. The data object will be transformed
            before every access. (default: :obj:`None`)
        pre_transform (callable, optional): A function/transform that takes
            in an :obj:`torch_geometric.data.Data` object and returns a
            transformed version. The data object will be transformed before
            being saved to disk. (default: :obj:`None`)
    """
    def __init__(self,
                 root,
                 FL=0,
                 splits=[0.5, 0.2, 0.3],
                 transform=None,
                 pre_transform=None):
        self.FL = FL
        if self.FL == 0:
            self.name = 'DBLPNew'
        elif self.FL == 1:
            self.name = 'FLDBLPbyConf'
        else:
            self.name = 'FLDBLPbyOrg'
        self._customized_splits = splits
        super(DBLPNew, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        names = ['dblp_new.tsv']
        return names

    @property
    def processed_file_names(self):
        return ['data.pt']

    @property
    def raw_dir(self):
        return osp.join(self.root, self.name, 'raw')

    @property
    def processed_dir(self):
        return osp.join(self.root, self.name, 'processed')

    def download(self):
        # Download the raw file to `self.raw_dir`.
        url = 'https://federatedscope.oss-cn-beijing.aliyuncs.com'
        for name in self.raw_file_names:
            download_url(f'{url}/{name}', self.raw_dir)

    def process(self):
        # Read data into a list of `Data` graphs.
        data_list = build_graph(self.raw_dir, self.raw_file_names[0], self.FL)
        data_list_w_masks = []
        for data in data_list:
            if data.num_nodes == 0:
                continue
            # Randomly split the nodes of each graph into train/val/test
            # according to `self._customized_splits`.
            indices = torch.randperm(data.num_nodes)
            n = len(data.y)
            train_size = round(self._customized_splits[0] * n)
            val_size = round(
                (self._customized_splits[0] + self._customized_splits[1]) * n)
            data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
            data.train_mask[indices[:train_size]] = True
            data.val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
            data.val_mask[indices[train_size:val_size]] = True
            data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
            data.test_mask[indices[val_size:]] = True
            data_list_w_masks.append(data)
        data_list = data_list_w_masks

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])
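

if __name__ == '__main__':
    # A minimal, hedged usage sketch, not part of the original module: the
    # `root` path below is an arbitrary assumption. FL=1 downloads and
    # processes the data into one subgraph per conference, each carrying
    # train/val/test masks.
    dataset = DBLPNew(root='data', FL=1)
    print(len(dataset))  # number of per-conference client graphs
    first = dataset[0]
    print(first.num_nodes, int(first.train_mask.sum()))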