Last month, I developed a script because lemmy.ml had become too slow. Unfortunately, I have the same problem again, but this time there are too many instances to evaluate, causing the script to take an excessively long time to complete. I’m seeking advice on how to enhance the script to simultaneously ping multiple instances. Are there any alternative scripts available that might provide a more efficient solution?
git clone https://github.com/LemmyNet/lemmy-stats-crawler
cd lemmy-stats-crawler
cargo run -- --json > stats.json
#!/usr/bin/env python3
import json
import time
import requests
import requests.exceptions
from typing import List, Dict
TIME_BETWEEN_REQUESTS = 5 # 10 * 60 = 10 minutes
TIME_TOTAL = 60 # 8 * 60 * 60 = 8 hours
def get_latency(domain):
try:
start = time.time()
if not domain.startswith(("http://", "https://")):
domain = "https://" + domain
requests.get(domain, timeout=3)
end = time.time()
return end - start
except requests.exceptions.Timeout:
return float("inf")
def measure_latencies(domains, duration):
latencies = {}
start_time = time.time()
end_time = start_time + duration
while time.time() < end_time:
latencies = measure_latencies_for_domains(domains, latencies)
time.sleep(TIME_BETWEEN_REQUESTS)
return latencies
def measure_latencies_for_domains(domains, latencies):
for domain in domains:
latency = get_latency(domain)
latencies = add_latency_to_domain(domain, latency, latencies)
return latencies
def add_latency_to_domain(domain, latency, latencies):
if domain not in latencies:
latencies[domain] = []
latencies[domain].append(latency)
return latencies
def average_latencies(latencies):
averages = []
for domain, latency_list in latencies.items():
avg_latency = sum(latency_list) / len(latency_list)
averages.append((domain, avg_latency))
return averages
def sort_latencies(averages):
return sorted(averages, key=lambda x: x[1])
def get_latency_report(domains, duration):
latencies = measure_latencies(domains, duration)
averages = average_latencies(latencies)
return sort_latencies(averages)
def get_instances(data: Dict) -> List[Dict]:
instances = []
for instance_details in data["instance_details"]:
instances.append(instance_details)
return instances
def get_domains(instances: List[Dict]) -> List[str]:
return [instance["domain"] for instance in instances]
def load_json_data(filepath: str) -> Dict:
with open(filepath) as json_data:
return json.load(json_data)
def main():
data = load_json_data('stats.json')
instances = get_instances(data)
domains = get_domains(instances)
report = get_latency_report(domains, TIME_TOTAL)
for domain, avg_latency in report:
print(f"{domain}: {avg_latency:.2f} seconds")
if __name__ == "__main__":
main()
You must log in or # to comment.
Sounds like it would be worth looking at the asyncio package.
This is the way.
Yeah, can use httpx or aiohttp for sending requests.
Second this as many major performance improvements have been made for it in latest.
Create a multi threaded script with consuming threads and a queue