This commit is contained in:
Sviatoslav Tsariov Yurievich 2023-09-13 18:15:40 +03:00
parent d94a244bc4
commit 3b9a576290

View File

@@ -7,15 +7,23 @@ import os
import requests
import time
from datetime import datetime, timedelta
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from config import Parser
token = Parser.GithubToken
def query(payload, variables=None):
r = requests.post(
session = requests.Session()
session.headers.update({'Authorization': f'bearer {token}'})
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)
r = session.post(
'https://api.github.com/graphql',
headers={'Authorization': f'bearer {token}'},
json={"query": payload, "variables": variables or {}}
)
r.raise_for_status()
@@ -80,13 +88,13 @@ def get_count(q):
def scrape(q, out_file):
path = f'responses/{out_file}'
if os.path.exists(path):
print('Skipping', path, 'already exists')
return
#path = f'responses/{out_file}'
#if os.path.exists(path):
# print('Skipping', path, 'already exists')
# return
all_repos = []
cursor = None
print('Creating', path)
#print('Creating', path)
while True:
r = get_repos(q, cursor, 100)
search = r['search']
@@ -94,8 +102,8 @@ def scrape(q, out_file):
cursor = pi['endCursor']
has_next = pi['hasNextPage']
total = search['repositoryCount']
if total > 2000:
raise ValueError(f'Too many results for {q}: {total}')
#if total > 2000:
# raise ValueError(f'Too many results for {q}: {total}')
all_repos += [e['node'] for e in search['edges']]
print(r['rateLimit'])
print(len(all_repos), ' / ', total, cursor)
@@ -118,8 +126,8 @@ def scrape(q, out_file):
collection.update_one(filter_dict, {"$set": entity}, upsert=True)
with open(path, 'w') as out:
print(out)
#with open(path, 'w') as out:
#print(out)
#json.dump(all_repos, out)
time.sleep(4)