#!/usr/bin/env python
"""Collect data on the most-starred repos using GitHub's GraphQL API."""
import json
import os
import time
from datetime import datetime, timedelta

import requests
from pymongo import MongoClient

from config import Parser

token = Parser.GithubToken


def query(payload, variables=None):
    """POST a GraphQL query and return the decoded JSON response."""
    r = requests.post(
        'https://api.github.com/graphql',
        headers={'Authorization': f'bearer {token}'},
        json={'query': payload, 'variables': variables or {}},
    )
    r.raise_for_status()
    return r.json()


repo_query = '''
query popular_repos($start: String, $num: Int!) {
  rateLimit {
    cost
    remaining
    resetAt
  }
  search(query: "is:public %s", type: REPOSITORY, first: $num, after: $start) {
    repositoryCount
    pageInfo {
      hasNextPage
      endCursor
    }
    edges {
      node {
        ... on Repository {
          nameWithOwner
          createdAt
          forkCount
          isFork
          updatedAt
          primaryLanguage {
            name
          }
          stargazers {
            totalCount
          }
          watchers {
            totalCount
          }
        }
      }
    }
  }
}
'''

count_query = '''
query {
  rateLimit {
    cost
    remaining
    resetAt
  }
  search(query: "is:public %s", type: REPOSITORY, first: 1) {
    repositoryCount
  }
}
'''


def get_repos(q, cursor, num):
    """Fetch one page of up to `num` repos matching the search query `q`."""
    return query(repo_query % q, {'start': cursor, 'num': num})['data']


def get_count(q):
    """Return the total number of repos matching the search query `q`."""
    return query(count_query % q)['data']['search']['repositoryCount']


def scrape(q, out_file):
    path = f'responses/{out_file}'
    if os.path.exists(path):
        print('Skipping', path, 'already exists')
        return

    all_repos = []
    cursor = None
    print('Creating', path)
    while True:
        r = get_repos(q, cursor, 100)
        search = r['search']
        pi = search['pageInfo']
        cursor = pi['endCursor']
        has_next = pi['hasNextPage']
        total = search['repositoryCount']
        # GitHub search pagination is capped around 1,000 results, so refuse
        # queries that match far more than we could ever page through.
        if total > 2000:
            raise ValueError(f'Too many results for {q}: {total}')
        all_repos += [e['node'] for e in search['edges']]
        print(r['rateLimit'])
        print(len(all_repos), ' / ', total, cursor)
        # Stop when we run out of pages, or bail early if the rate limit
        # budget is nearly exhausted.
        if not has_next or r['rateLimit']['remaining'] < 10:
            break

    # Upsert each repo into MongoDB, keyed on nameWithOwner so re-scrapes
    # update existing documents instead of duplicating them.
    client = MongoClient('mongodb://admin:admin@localhost:27017')
    db = client['git']
    collection = db['repos']
    for repo in all_repos:
        entity = {}
        filter_dict = {}
        for key, value in repo.items():
            if key == 'nameWithOwner':
                filter_dict[key] = value
            else:
                entity[key] = value
        collection.update_one(filter_dict, {'$set': entity}, upsert=True)

    # Also write the raw results to disk; this file doubles as the
    # "already scraped" marker checked at the top of this function.
    with open(path, 'w') as out:
        json.dump(all_repos, out)
    time.sleep(4)


def scrape_star_range(low, high):
    """Scrape a simple star range [low, high]."""
    out_file = f'repos.stars={low}..{high}.json'
    q = f'stars:{low}..{high}'
    scrape(q, out_file)


def scrape_breaks():
    with open('breaks.json') as f:
        breaks = json.load(f)
    for hi, lo in zip(breaks[:-1], breaks[1:]):
        scrape_star_range(lo, hi - 1)


def scrape_star_dates():
    for stars in range(123, 15, -1):
        out_file = f'repos.star={stars}.-2015.json'
        q = f'stars:{stars} created:<=2015'
        scrape(q, out_file)

        out_file = f'repos.star={stars}.2016-.json'
        q = f'stars:{stars} created:>=2016'
        scrape(q, out_file)


def query_for_star_years(stars, start, end):
    """Build a search query; start/end may be years or 'YYYY-MM-DD' strings."""
    q = f'stars:{stars}'
    if start == 2010 and end == 2023:
        return q
    elif start == 2010:
        return f'{q} created:<={end}'
    elif end == 2023:
        return f'{q} created:>={start}'
    else:
        return f'{q} created:{start}..{end}'


def split_interval(a, b):
    """Split [a, b] into two roughly equal halves."""
    d = int((b - a) / 2)
    return [(a, a + d), (a + d + 1, b)]
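
# Worked example (illustrative, not from the original script):
# int((2023 - 2010) / 2) == 6, so split_interval(2010, 2023) returns
# [(2010, 2016), (2017, 2023)]. split_by_year below recurses on each half
# until the matching repo count fits under GitHub's 1,000-result search cap.
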
def split_by_year(stars, start, end):
    q = query_for_star_years(stars, start, end)
    if start == 2010 and end == 2023:
        # The full range is known to exceed the 1,000-result cap; skip the
        # count query and go straight to splitting.
        c = 1001
    elif start == end:
        # A single year may still be too big; recurse on day ranges instead.
        split_by_days(
            stars,
            datetime(start, 1, 1),
            datetime(start, 12, 31)
        )
        return
    else:
        c = get_count(q)

    if c <= 1000:
        out_file = f'repos.star={stars}.{start}-{end}.json'
        print(f'query: {q}')
        scrape(q, out_file)
    else:
        # Defensive guard; single years are delegated to split_by_days above.
        if start == end:
            raise ValueError(f"Can't split any more for {stars} / {start}")
        print(f'{stars} {start}..{end} -> {c}, will split')
        for a, b in split_interval(start, end):
            split_by_year(stars, a, b)


def split_by_days(stars, day_start, day_end):
    start_fmt = day_start.strftime('%Y-%m-%d')
    end_fmt = day_end.strftime('%Y-%m-%d')
    q = query_for_star_years(stars, start_fmt, end_fmt)
    c = get_count(q)
    if c <= 1000:
        out_file = f'repos.star={stars}.{start_fmt}-{end_fmt}.json'
        print(f'query: {q}')
        scrape(q, out_file)
    else:
        days = (day_end - day_start).days
        if days == 0:
            raise ValueError(f"Can't split any more: {stars} / {day_start} .. {day_end}")
        for a, b in split_interval(0, days):
            dt_a = day_start + timedelta(days=a)
            dt_b = day_start + timedelta(days=b)
            split_by_days(stars, dt_a, dt_b)


def scrape_star_dates_split():
    # for stars in range(83, 15, -1):
    for stars in range(40, 15, -1):
        split_by_year(stars, 2010, 2023)


def scrape_range_days():
    # Scrape from a low star range up, splitting by creation date
    # (which never changes).
    # ranges = [(15, 20), (21, 25), (26, 30), (31, 35), (36, 40), (41, 45), (46, 49)]
    # ranges = [(50, 60), (61, 70), (71, 80), (81, 90), (91, 100)]
    # ranges = ranges + [(100, 119), (120, 139), (140, 159), (160, 179), (180, 200)]
    # ranges = ranges + [(201, 225), (226, 250), (251, 300), (301, 400), (401, 500)]
    # ranges = ranges + [(501, 700), (701, 1000), (1001, 1500), (1501, 5000), (5001, 1_000_000)]
    ranges = [(1001, 1500), (1501, 5000), (5001, 1_000_000)]
    for a, b in ranges:
        stars = f'{a}..{b}'
        split_by_days(stars, datetime(2007, 1, 1), datetime(2024, 2, 2))


# if __name__ == '__main__':
#     scrape_breaks()
#     scrape_star_dates()
#     scrape_star_dates_split()
# scrape_range_days()
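
# A minimal smoke test (a sketch, not part of the original run configuration;
# 'stars:>50000' is an illustrative query and assumes config.Parser.GithubToken
# holds a valid token):
#
# if __name__ == '__main__':
#     print(get_count('stars:>50000'))
#     page = get_repos('stars:>50000', None, 5)
#     print([e['node']['nameWithOwner'] for e in page['search']['edges']])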