#!/usr/bin/env python
"""Collect data on the most-starred repos using GitHub's GraphQL API."""
import json
import os
import time
from datetime import datetime, timedelta

import requests
from pymongo import MongoClient

from config import Parser

token = Parser.GithubToken


def query(payload, variables=None):
    """POST a GraphQL query and return the decoded JSON response."""
    r = requests.post(
        'https://api.github.com/graphql',
        headers={'Authorization': f'bearer {token}'},
        json={'query': payload, 'variables': variables or {}},
    )
    r.raise_for_status()
    resp = r.json()
    # GraphQL-level failures come back as HTTP 200 with an "errors" key,
    # which raise_for_status() does not catch.
    if 'errors' in resp:
        raise RuntimeError(f'GraphQL errors: {resp["errors"]}')
    return resp
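

# Long scrapes occasionally hit transient server errors from the GraphQL
# endpoint. Below is a minimal retry wrapper around query() -- a sketch, not
# used by the functions that follow; the attempt count and delay are
# arbitrary choices, not values from the original script.
def query_with_retry(payload, variables=None, attempts=3, delay=10):
    for attempt in range(attempts):
        try:
            return query(payload, variables)
        except requests.HTTPError:
            if attempt == attempts - 1:
                raise
            time.sleep(delay)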


repo_query = '''
query popular_repos($start: String, $num: Int!) {
  rateLimit {
    cost
    remaining
    resetAt
  }
  search(query: "is:public %s", type: REPOSITORY, first: $num, after: $start) {
    repositoryCount
    pageInfo {
      hasNextPage
      endCursor
    }
    edges {
      node {
        ... on Repository {
          nameWithOwner
          createdAt
          forkCount
          isFork
          updatedAt
          primaryLanguage {
            name
          }
          stargazers {
            totalCount
          }
          watchers {
            totalCount
          }
        }
      }
    }
  }
}
'''

count_query = '''
query {
  rateLimit {
    cost
    remaining
    resetAt
  }
  search(query: "is:public %s", type: REPOSITORY, first: 1) {
    repositoryCount
  }
}
'''


def get_repos(q, cursor, num):
    """Fetch one page of up to `num` repos matching search query `q`."""
    return query(repo_query % q, {'start': cursor, 'num': num})['data']


def get_count(q):
    """Return the total number of repos matching search query `q`."""
    return query(count_query % q)['data']['search']['repositoryCount']
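
# Usage sketch (the star range here is illustrative):
#   n = get_count('stars:100..200')
# The splitting logic below compares such counts against GitHub's search cap,
# since only the first 1,000 results of any query can be paged through.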


def scrape(q, out_file):
    """Page through all results for `q`, upsert them into MongoDB, and
    write the raw list to responses/<out_file>."""
    path = f'responses/{out_file}'
    if os.path.exists(path):
        print('Skipping', path, 'already exists')
        return

    all_repos = []
    cursor = None
    print('Creating', path)
    while True:
        r = get_repos(q, cursor, 100)
        search = r['search']
        pi = search['pageInfo']
        cursor = pi['endCursor']
        has_next = pi['hasNextPage']
        total = search['repositoryCount']
        if total > 2000:
            raise ValueError(f'Too many results for {q}: {total}')
        all_repos += [e['node'] for e in search['edges']]
        print(r['rateLimit'])
        print(len(all_repos), ' / ', total, cursor)
        # Stop on the last page, or when the rate limit is nearly spent.
        if not has_next or r['rateLimit']['remaining'] < 10:
            break

    # Upsert each repo keyed on nameWithOwner, so re-runs update in place.
    client = MongoClient('mongodb://admin:admin@localhost:27017')
    db = client['git']
    collection = db['repos']
    for repo in all_repos:
        entity = {}
        filter_dict = {}
        for key, value in repo.items():
            if key == 'nameWithOwner':
                filter_dict[key] = value
            else:
                entity[key] = value
        collection.update_one(filter_dict, {'$set': entity}, upsert=True)

    with open(path, 'w') as out:
        json.dump(all_repos, out)
    time.sleep(4)
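

# The rateLimit block in every response includes resetAt (an ISO-8601 UTC
# timestamp). Rather than breaking out of a scrape when `remaining` runs low,
# a caller could wait out the window -- a sketch, not wired into scrape():
def sleep_until_reset(reset_at):
    reset = datetime.strptime(reset_at, '%Y-%m-%dT%H:%M:%SZ')
    wait = (reset - datetime.utcnow()).total_seconds()
    if wait > 0:
        time.sleep(wait + 1)  # a second of slack for clock skew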


def scrape_star_range(low, high):
    """Scrape a simple star range [low, high]."""
    out_file = f'repos.stars={low}..{high}.json'
    q = f'stars:{low}..{high}'
    scrape(q, out_file)


def scrape_breaks():
    # breaks.json holds star-count breakpoints in descending order;
    # adjacent pairs become [lo, hi - 1] ranges.
    breaks = json.load(open('breaks.json'))
    for hi, lo in zip(breaks[:-1], breaks[1:]):
        scrape_star_range(lo, hi - 1)


def scrape_star_dates():
    for stars in range(123, 15, -1):
        out_file = f'repos.star={stars}.-2015.json'
        q = f'stars:{stars} created:<=2015'
        scrape(q, out_file)

        out_file = f'repos.star={stars}.2016-.json'
        q = f'stars:{stars} created:>=2016'
        scrape(q, out_file)


def query_for_star_years(stars, start, end):
    # `start`/`end` are years here, but split_by_days passes YYYY-MM-DD
    # strings, which fall through to the created:start..end branch.
    q = f'stars:{stars}'
    if start == 2010 and end == 2023:
        return q
    elif start == 2010:
        return f'{q} created:<={end}'
    elif end == 2023:
        return f'{q} created:>={start}'
    else:
        return f'{q} created:{start}..{end}'


def split_interval(a, b):
    """Split [a, b] into two adjacent halves."""
    d = int((b - a) / 2)
    return [(a, a + d), (a + d + 1, b)]
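
# For example, split_interval(2010, 2023) == [(2010, 2016), (2017, 2023)]:
# the halves are adjacent and non-overlapping, so no repo is counted twice.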


def split_by_year(stars, start, end):
    if start == 2010 and end == 2023:
        c = 1001  # we know this will fail.
    elif start == end:
        # A single year: switch to day-level splitting, which checks the count.
        split_by_days(stars, datetime(start, 1, 1), datetime(start, 12, 31))
        return
    else:
        q = query_for_star_years(stars, start, end)
        c = get_count(q)
    if c <= 1000:
        out_file = f'repos.star={stars}.{start}-{end}.json'
        print(f'query: {q}')
        scrape(q, out_file)
    else:
        if start == end:
            raise ValueError(f"Can't split any more for {stars} / {start}")
        print(f'{stars} {start}..{end} -> {c}, will split')
        for a, b in split_interval(start, end):
            split_by_year(stars, a, b)


def split_by_days(stars, day_start, day_end):
    start_fmt = day_start.strftime('%Y-%m-%d')
    end_fmt = day_end.strftime('%Y-%m-%d')
    q = query_for_star_years(stars, start_fmt, end_fmt)
    c = get_count(q)
    if c <= 1000:
        out_file = f'repos.star={stars}.{start_fmt}-{end_fmt}.json'
        print(f'query: {q}')
        scrape(q, out_file)
    else:
        days = (day_end - day_start).days
        if days == 0:
            raise ValueError(f"Can't split any more: {stars} / {day_start} .. {day_end}")
        # Split the day offsets in half and recurse on each sub-window.
        for a, b in split_interval(0, days):
            dt_a = day_start + timedelta(days=a)
            dt_b = day_start + timedelta(days=b)
            split_by_days(stars, dt_a, dt_b)
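
# E.g. a 10-day window (offsets 0..9) that exceeds the cap splits via
# split_interval(0, 9) == [(0, 4), (5, 9)] into two adjacent 5-day windows.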


def scrape_star_dates_split():
    # for stars in range(83, 15, -1):
    for stars in range(40, 15, -1):
        split_by_year(stars, 2010, 2023)


def scrape_range_days():
    # Scrape from a low star range up, splitting by creation date (which never changes).
    # Ranges from earlier runs, kept for reference:
    # ranges = [(15, 20), (21, 25), (26, 30), (31, 35), (36, 40), (41, 45), (46, 49)]
    # ranges = [(50, 60), (61, 70), (71, 80), (81, 90), (91, 100)]
    # ranges = ranges + [(100, 119), (120, 139), (140, 159), (160, 179), (180, 200)]
    # ranges = ranges + [(201, 225), (226, 250), (251, 300), (301, 400), (401, 500)]
    # ranges = ranges + [(501, 700), (701, 1000), (1001, 1500), (1501, 5000), (5001, 1_000_000)]
    ranges = [(1001, 1500), (1501, 5000), (5001, 1_000_000)]
    for a, b in ranges:
        stars = f'{a}..{b}'
        split_by_days(stars, datetime(2007, 1, 1), datetime(2024, 2, 2))
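

# Once the per-query files exist under responses/, they can be merged for
# analysis -- a sketch, not part of the scraping flow above; de-duplication
# mirrors the nameWithOwner key used by the MongoDB upsert:
def load_all_repos():
    import glob
    seen = {}
    for path in glob.glob('responses/repos.*.json'):
        with open(path) as f:
            for repo in json.load(f):
                seen[repo['nameWithOwner']] = repo
    return list(seen.values())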


if __name__ == '__main__':
    # Earlier collection passes, kept for reference:
    # scrape_breaks()
    # scrape_star_dates()
    # scrape_star_dates_split()
    scrape_range_days()