Sviatoslav Tsariov Yurievich 2023-09-13 18:15:40 +03:00
parent d94a244bc4
commit 3b9a576290


@@ -7,15 +7,23 @@ import os
 import requests
 import time
 from datetime import datetime, timedelta
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
 from config import Parser
 token = Parser.GithubToken
 def query(payload, variables=None):
-    r = requests.post(
+    session = requests.Session()
+    session.headers.update({'Authorization': f'bearer {token}'})
+    retry = Retry(connect=3, backoff_factor=0.5)
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount('http://', adapter)
+    session.mount('https://', adapter)
+    r = session.post(
         'https://api.github.com/graphql',
-        headers={'Authorization': f'bearer {token}'},
         json={"query": payload, "variables": variables or {}}
     )
     r.raise_for_status()
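
Note on the hunk above: the bare requests.post call is replaced by a requests.Session that carries the Authorization header and has urllib3 retries mounted for both schemes, so transient connection failures to api.github.com are retried with exponential backoff instead of failing the whole run. A minimal self-contained sketch of the same pattern (the helper name make_session and the example token are hypothetical, not part of this file):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session(token):
    # The Session keeps the auth header and the retry policy in one place
    # and pools connections if it is reused across calls.
    session = requests.Session()
    session.headers.update({'Authorization': f'bearer {token}'})
    # Retry connection errors up to 3 times with exponential backoff (factor 0.5).
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

# Usage (hypothetical token value):
# session = make_session('ghp_example')
# r = session.post('https://api.github.com/graphql',
#                  json={'query': '{ viewer { login } }', 'variables': {}})
# r.raise_for_status()
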
@@ -80,13 +88,13 @@ def get_count(q)
 def scrape(q, out_file):
-    path = f'responses/{out_file}'
-    if os.path.exists(path):
-        print('Skipping', path, 'already exists')
-        return
+    #path = f'responses/{out_file}'
+    #if os.path.exists(path):
+    #    print('Skipping', path, 'already exists')
+    #    return
     all_repos = []
     cursor = None
-    print('Creating', path)
+    #print('Creating', path)
     while True:
         r = get_repos(q, cursor, 100)
         search = r['search']
@@ -94,8 +102,8 @@ def scrape(q, out_file)
         cursor = pi['endCursor']
         has_next = pi['hasNextPage']
         total = search['repositoryCount']
-        if total > 2000:
-            raise ValueError(f'Too many results for {q}: {total}')
+        #if total > 2000:
+        #    raise ValueError(f'Too many results for {q}: {total}')
         all_repos += [e['node'] for e in search['edges']]
         print(r['rateLimit'])
         print(len(all_repos), ' / ', total, cursor)
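
Note on the loop being edited here: scrape() pages through the GraphQL search connection by cursor until hasNextPage is false, accumulating the repository nodes. A pared-down sketch of that pagination pattern, assuming the module's existing get_repos(q, cursor, page_size) helper (the function name fetch_all is hypothetical):

def fetch_all(q, page_size=100):
    # Walk the search connection cursor by cursor, collecting repository nodes.
    all_repos = []
    cursor = None
    while True:
        r = get_repos(q, cursor, page_size)  # existing helper in this module
        search = r['search']
        pi = search['pageInfo']
        cursor = pi['endCursor']
        all_repos += [e['node'] for e in search['edges']]
        if not pi['hasNextPage']:
            return all_repos
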
@@ -118,8 +126,8 @@ def scrape(q, out_file)
             collection.update_one(filter_dict, {"$set": entity}, upsert=True)
-        with open(path, 'w') as out:
-            print(out)
+        #with open(path, 'w') as out:
+        #    print(out)
             #json.dump(all_repos, out)
         time.sleep(4)
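
Note on the unchanged update_one line in the last hunk: with upsert=True, pymongo inserts the repository document when the filter matches nothing and otherwise overwrites the listed fields, so re-running the scraper stays idempotent. A small sketch, assuming a local MongoDB and a repos collection keyed by nameWithOwner (the real connection string, database, and filter fields are not shown in this diff):

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')  # assumed connection string
collection = client['github']['repos']             # assumed database/collection names

def upsert_repo(entity):
    # Insert the document if its key is new, otherwise update its fields in place.
    filter_dict = {'nameWithOwner': entity['nameWithOwner']}
    collection.update_one(filter_dict, {'$set': entity}, upsert=True)
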