#!/usr/bin/env python
"""Collect data on the most-starred repos using GitHub's GraphQL API."""

import json
import os
import time
from datetime import datetime, timedelta

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from config import Parser

token = Parser.GithubToken


def query(payload, variables=None):
    """POST a GraphQL query to the GitHub API and return the parsed JSON response."""
    session = requests.Session()
    session.headers.update({'Authorization': f'bearer {token}'})
    # Retry transient connection failures with a short exponential backoff.
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    r = session.post(
        'https://api.github.com/graphql',
        json={"query": payload, "variables": variables or {}}
    )
    r.raise_for_status()
    return r.json()

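# A hypothetical direct call, for illustration only (count_query is defined
# below; the token above must grant GraphQL API access):
#
#   query(count_query % 'stars:>1000')
#
# returns the parsed response, roughly {'data': {'rateLimit': {...},
# 'search': {'repositoryCount': ...}}}, which get_repos()/get_count() unpack.

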
repo_query = '''
query popular_repos($start: String, $num: Int!){
  rateLimit {
    cost
    remaining
    resetAt
  }
  search(query: "is:public %s", type: REPOSITORY, first: $num, after: $start) {
    repositoryCount
    pageInfo {
      hasNextPage
      endCursor
    }
    edges {
      node {
        ... on Repository {
          nameWithOwner
          createdAt
          forkCount
          isFork
          updatedAt
          primaryLanguage {
            name
          }
          stargazers {
            totalCount
          }
          watchers {
            totalCount
          }
        }
      }
    }
  }
}
'''

count_query = '''
query {
  rateLimit {
    cost
    remaining
    resetAt
  }
  search(query: "is:public %s", type: REPOSITORY, first: 1) {
    repositoryCount
  }
}
'''

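# For illustration: get_repos()/get_count() below substitute a GitHub search
# string for the %s placeholder, so e.g. count_query % 'stars:15..20' asks for
#
#   search(query: "is:public stars:15..20", type: REPOSITORY, first: 1)
#
# The 'stars:15..20' qualifier here is only a hypothetical example; the real
# queries are built by the scrape_* helpers further down.

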
def get_repos(q, cursor, num):
    return query(repo_query % q, {'start': cursor, 'num': num})['data']


def get_count(q):
    return query(count_query % q)['data']['search']['repositoryCount']


def scrape(q, out_file):
    #path = f'responses/{out_file}'
    #if os.path.exists(path):
    #    print('Skipping', path, 'already exists')
    #    return
    all_repos = []
    cursor = None
    #print('Creating', path)
    while True:
        # Page through the search results 100 repos at a time.
        r = get_repos(q, cursor, 100)
        search = r['search']
        pi = search['pageInfo']
        cursor = pi['endCursor']
        has_next = pi['hasNextPage']
        total = search['repositoryCount']
        #if total > 2000:
        #    raise ValueError(f'Too many results for {q}: {total}')
        all_repos += [e['node'] for e in search['edges']]
        print(r['rateLimit'])
        print(len(all_repos), ' / ', total, cursor)
        # Stop when there are no more pages or we are close to the rate limit.
        if not has_next or r['rateLimit']['remaining'] < 10:
            break
    #print(all_repos)

    # Upsert each repo into MongoDB, keyed by its nameWithOwner.
    from pymongo import MongoClient
    client = MongoClient("mongodb://admin:admin@localhost:27017")
    db = client['git']
    collection = db['repos']
    for repo in all_repos:
        entity = {}
        filter_dict = {}

        for key, value in repo.items():
            if key == "nameWithOwner":
                filter_dict[key] = value
            else:
                entity[key] = value

        collection.update_one(filter_dict, {"$set": entity}, upsert=True)

    #with open(path, 'w') as out:
    #    print(out)
    #    json.dump(all_repos, out)
    time.sleep(4)

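# A hypothetical invocation, assuming the MongoDB instance above is reachable
# (the query string and file name follow the pattern used by scrape_star_range):
#
#   scrape('stars:100..200', 'repos.stars=100..200.json')
#
# Results are upserted into the git.repos collection; the out_file argument is
# currently unused because the JSON dump above is commented out.

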
def scrape_star_range(low, high):
    """Scrape a simple star range [low, high]."""
    out_file = f'repos.stars={low}..{high}.json'
    q = 'stars:%d..%d' % (low, high)
    scrape(q, out_file)


def scrape_breaks():
    with open('breaks.json') as f:
        breaks = json.load(f)
    for hi, lo in zip(breaks[:-1], breaks[1:]):
        scrape_star_range(lo, hi - 1)

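# Illustration (the contents of breaks.json are assumed, not part of this
# source): a descending list of star counts such as [1000, 500, 100] would
# produce the calls scrape_star_range(500, 999) and scrape_star_range(100, 499).

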
def scrape_star_dates():
    for stars in range(123, 15, -1):
        out_file = f'repos.star={stars}.-2015.json'
        q = 'stars:%d created:<=2015' % stars
        scrape(q, out_file)

        out_file = f'repos.star={stars}.2016-.json'
        q = 'stars:%d created:>=2016' % stars
        scrape(q, out_file)


def query_for_star_years(stars, start, end):
    # 2010 and 2023 act as open-ended bounds: a range covering both is left
    # unconstrained by creation date.
    q = 'stars:%s' % stars
    if start == 2010 and end == 2023:
        return q
    elif start == 2010:
        return f'{q} created:<={end}'
    elif end == 2023:
        return f'{q} created:>={start}'
    else:
        return f'{q} created:{start}..{end}'

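# Worked examples of the query strings this produces:
#
#   query_for_star_years(25, 2010, 2023)  ->  'stars:25'
#   query_for_star_years(25, 2010, 2016)  ->  'stars:25 created:<=2016'
#   query_for_star_years(25, 2012, 2015)  ->  'stars:25 created:2012..2015'

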
def split_interval(a, b):
    d = int((b - a) / 2)
    return [(a, a + d), (a + d + 1, b)]

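# For example, split_interval(2010, 2023) returns [(2010, 2016), (2017, 2023)],
# which is how split_by_year and split_by_days below halve a range that has
# too many results.

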
def split_by_year(stars, start, end):
    if start == 2010 and end == 2023:
        c = 1001  # we know the full range will fail; force a split without querying.
    elif start == end:
        split_by_days(
            stars,
            datetime(start, 1, 1),
            datetime(start, 12, 31)
        )
        return
    else:
        q = query_for_star_years(stars, start, end)
        c = get_count(q)
    if c <= 1000:
        out_file = f'repos.star={stars}.{start}-{end}.json'
        print(f'query: {q}')
        scrape(q, out_file)
    else:
        if start == end:
            raise ValueError(f'Can\'t split any more for {stars} / {start}')
        print(f'{stars} {start}..{end} -> {c}, will split')
        for a, b in split_interval(start, end):
            split_by_year(stars, a, b)


def split_by_days(stars, day_start, day_end):
    start_fmt = day_start.strftime('%Y-%m-%d')
    end_fmt = day_end.strftime('%Y-%m-%d')
    q = query_for_star_years(stars, start_fmt, end_fmt)
    c = get_count(q)
    if c <= 1000:
        out_file = f'repos.star={stars}.{start_fmt}-{end_fmt}.json'
        print(f'query: {q}')
        scrape(q, out_file)
    else:
        # GitHub search only returns the first 1000 results, so halve the date
        # range and recurse until each sub-range fits under the cap.
        days = (day_end - day_start).days
        if days == 0:
            raise ValueError(f'Can\'t split any more: {stars} / {day_start} .. {day_end}')
        for a, b in split_interval(0, days):
            dt_a = day_start + timedelta(days=a)
            dt_b = day_start + timedelta(days=b)
            split_by_days(stars, dt_a, dt_b)

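# Illustrative example of the date splitting (dates are hypothetical): a window
# of 2015-01-01 .. 2015-01-07 with more than 1000 results is split via
# split_interval(0, 6) into 2015-01-01 .. 2015-01-04 and 2015-01-05 .. 2015-01-07,
# and each half is retried.

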
def scrape_star_dates_split():
    #for stars in range(83, 15, -1):
    for stars in range(40, 15, -1):
        split_by_year(stars, 2010, 2023)


def scrape_range_days():
    # Scrape from a low star range up, splitting by creation date (which never changes).
    # ranges = [(15, 20), (21, 25), (26, 30), (31, 35), (36, 40), (41, 45), (46, 49)]
    # ranges = [(50, 60), (61, 70), (71, 80), (81, 90), (91, 100)]
    # ranges = ranges + [(100, 119), (120, 139), (140, 159), (160, 179), (180, 200)]
    # ranges = ranges + [(201, 225), (226, 250), (251, 300), (301, 400), (401, 500)]
    # ranges = ranges + [(501, 700), (701, 1000), (1001, 1500), (1501, 5000), (5001, 1_000_000)]
    ranges = [(1001, 1500), (1501, 5000), (5001, 1_000_000)]
    for a, b in ranges:
        stars = f'{a}..{b}'
        split_by_days(stars, datetime(2007, 1, 1), datetime(2024, 2, 2))


#if __name__ == '__main__':
#    scrape_breaks()
#    scrape_star_dates()
#    scrape_star_dates_split()
#    scrape_range_days()