Compare commits

...

6 Commits

3
.gitignore vendored

@ -0,0 +1,3 @@
build/
dist/
*.egg-info/

@ -4,6 +4,7 @@ url = "https://pypi.org/simple"
verify_ssl = true
[dev-packages]
twine = "*"
[packages]
requests = "~=2.24"

283
Pipfile.lock generated

@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "38279dd8a59254b5d642ef1254d1f3d27e89ccc4b2dede749e4bd82a836c34d5"
"sha256": "68c900522d72644f8d7c736e35e8e0cc807c26e3b695dabdf5d4c4e146cbb4ed"
},
"pipfile-spec": 6,
"requires": {
@ -18,12 +18,12 @@
"default": {
"beautifulsoup4": {
"hashes": [
"sha256:73cc4d115b96f79c7d77c1c7f7a0a8d4c57860d1041df407dd1aae7f07a77fd7",
"sha256:a6237df3c32ccfaee4fd201c8f5f9d9df619b93121d01353a64a73ce8c6ef9a8",
"sha256:e718f2342e2e099b640a34ab782407b7b676f47ee272d6739e60b8ea23829f2c"
"sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35",
"sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25",
"sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666"
],
"index": "pypi",
"version": "==4.9.1"
"version": "==4.9.3"
},
"certifi": {
"hashes": [
@ -76,16 +76,16 @@
"sha256:1634eea42ab371d3d346309b93df7870a88610f0725d47528be902a0d95ecc55",
"sha256:a59dc181727e95d25f781f0eb4fd1825ff45590ec8ff49eadfd7f1a537cc0232"
],
"markers": "python_version >= '3.5'",
"markers": "python_version >= '3.0'",
"version": "==2.0.1"
},
"urllib3": {
"hashes": [
"sha256:91056c15fa70756691db97756772bb1eb9678fa585d9184f24534b100dc60f4a",
"sha256:e7983572181f5e1522d9c98453462384ee92a0be7fac5f1413a1e35c56cc0461"
"sha256:8d7eaa5a82a1cac232164990f04874c594c9453ec55eef02eab885aa02fc17a2",
"sha256:f5321fbe4bf3fefa0efd0bfe7fb14e90909eb62a48ccda331726b4319897dd5e"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.25.10"
"version": "==1.25.11"
},
"webencodings": {
"hashes": [
@ -95,5 +95,268 @@
"version": "==0.5.1"
}
},
"develop": {}
"develop": {
"bleach": {
"hashes": [
"sha256:52b5919b81842b1854196eaae5ca29679a2f2e378905c346d3ca8227c2c66080",
"sha256:9f8ccbeb6183c6e6cddea37592dfb0167485c1e3b13b3363bc325aa8bda3adbd"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==3.2.1"
},
"certifi": {
"hashes": [
"sha256:5930595817496dd21bb8dc35dad090f1c2cd0adfaf21204bf6732ca5d8ee34d3",
"sha256:8fc0819f1f30ba15bdb34cceffb9ef04d99f420f68eb75d901e9560b8749fc41"
],
"version": "==2020.6.20"
},
"cffi": {
"hashes": [
"sha256:005f2bfe11b6745d726dbb07ace4d53f057de66e336ff92d61b8c7e9c8f4777d",
"sha256:09e96138280241bd355cd585148dec04dbbedb4f46128f340d696eaafc82dd7b",
"sha256:0b1ad452cc824665ddc682400b62c9e4f5b64736a2ba99110712fdee5f2505c4",
"sha256:0ef488305fdce2580c8b2708f22d7785ae222d9825d3094ab073e22e93dfe51f",
"sha256:15f351bed09897fbda218e4db5a3d5c06328862f6198d4fb385f3e14e19decb3",
"sha256:22399ff4870fb4c7ef19fff6eeb20a8bbf15571913c181c78cb361024d574579",
"sha256:23e5d2040367322824605bc29ae8ee9175200b92cb5483ac7d466927a9b3d537",
"sha256:2791f68edc5749024b4722500e86303a10d342527e1e3bcac47f35fbd25b764e",
"sha256:2f9674623ca39c9ebe38afa3da402e9326c245f0f5ceff0623dccdac15023e05",
"sha256:3363e77a6176afb8823b6e06db78c46dbc4c7813b00a41300a4873b6ba63b171",
"sha256:33c6cdc071ba5cd6d96769c8969a0531be2d08c2628a0143a10a7dcffa9719ca",
"sha256:3b8eaf915ddc0709779889c472e553f0d3e8b7bdf62dab764c8921b09bf94522",
"sha256:3cb3e1b9ec43256c4e0f8d2837267a70b0e1ca8c4f456685508ae6106b1f504c",
"sha256:3eeeb0405fd145e714f7633a5173318bd88d8bbfc3dd0a5751f8c4f70ae629bc",
"sha256:44f60519595eaca110f248e5017363d751b12782a6f2bd6a7041cba275215f5d",
"sha256:4d7c26bfc1ea9f92084a1d75e11999e97b62d63128bcc90c3624d07813c52808",
"sha256:529c4ed2e10437c205f38f3691a68be66c39197d01062618c55f74294a4a4828",
"sha256:6642f15ad963b5092d65aed022d033c77763515fdc07095208f15d3563003869",
"sha256:85ba797e1de5b48aa5a8427b6ba62cf69607c18c5d4eb747604b7302f1ec382d",
"sha256:8f0f1e499e4000c4c347a124fa6a27d37608ced4fe9f7d45070563b7c4c370c9",
"sha256:a624fae282e81ad2e4871bdb767e2c914d0539708c0f078b5b355258293c98b0",
"sha256:b0358e6fefc74a16f745afa366acc89f979040e0cbc4eec55ab26ad1f6a9bfbc",
"sha256:bbd2f4dfee1079f76943767fce837ade3087b578aeb9f69aec7857d5bf25db15",
"sha256:bf39a9e19ce7298f1bd6a9758fa99707e9e5b1ebe5e90f2c3913a47bc548747c",
"sha256:c11579638288e53fc94ad60022ff1b67865363e730ee41ad5e6f0a17188b327a",
"sha256:c150eaa3dadbb2b5339675b88d4573c1be3cb6f2c33a6c83387e10cc0bf05bd3",
"sha256:c53af463f4a40de78c58b8b2710ade243c81cbca641e34debf3396a9640d6ec1",
"sha256:cb763ceceae04803adcc4e2d80d611ef201c73da32d8f2722e9d0ab0c7f10768",
"sha256:cc75f58cdaf043fe6a7a6c04b3b5a0e694c6a9e24050967747251fb80d7bce0d",
"sha256:d80998ed59176e8cba74028762fbd9b9153b9afc71ea118e63bbf5d4d0f9552b",
"sha256:de31b5164d44ef4943db155b3e8e17929707cac1e5bd2f363e67a56e3af4af6e",
"sha256:e66399cf0fc07de4dce4f588fc25bfe84a6d1285cc544e67987d22663393926d",
"sha256:f0620511387790860b249b9241c2f13c3a80e21a73e0b861a2df24e9d6f56730",
"sha256:f4eae045e6ab2bb54ca279733fe4eb85f1effda392666308250714e01907f394",
"sha256:f92cdecb618e5fa4658aeb97d5eb3d2f47aa94ac6477c6daf0f306c5a3b9e6b1",
"sha256:f92f789e4f9241cd262ad7a555ca2c648a98178a953af117ef7fad46aa1d5591"
],
"version": "==1.14.3"
},
"chardet": {
"hashes": [
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
"sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
],
"version": "==3.0.4"
},
"colorama": {
"hashes": [
"sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b",
"sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==0.4.4"
},
"cryptography": {
"hashes": [
"sha256:07ca431b788249af92764e3be9a488aa1d39a0bc3be313d826bbec690417e538",
"sha256:13b88a0bd044b4eae1ef40e265d006e34dbcde0c2f1e15eb9896501b2d8f6c6f",
"sha256:32434673d8505b42c0de4de86da8c1620651abd24afe91ae0335597683ed1b77",
"sha256:3cd75a683b15576cfc822c7c5742b3276e50b21a06672dc3a800a2d5da4ecd1b",
"sha256:4e7268a0ca14536fecfdf2b00297d4e407da904718658c1ff1961c713f90fd33",
"sha256:545a8550782dda68f8cdc75a6e3bf252017aa8f75f19f5a9ca940772fc0cb56e",
"sha256:55d0b896631412b6f0c7de56e12eb3e261ac347fbaa5d5e705291a9016e5f8cb",
"sha256:5849d59358547bf789ee7e0d7a9036b2d29e9a4ddf1ce5e06bb45634f995c53e",
"sha256:6dc59630ecce8c1f558277ceb212c751d6730bd12c80ea96b4ac65637c4f55e7",
"sha256:7117319b44ed1842c617d0a452383a5a052ec6aa726dfbaffa8b94c910444297",
"sha256:75e8e6684cf0034f6bf2a97095cb95f81537b12b36a8fedf06e73050bb171c2d",
"sha256:7b8d9d8d3a9bd240f453342981f765346c87ade811519f98664519696f8e6ab7",
"sha256:a035a10686532b0587d58a606004aa20ad895c60c4d029afa245802347fab57b",
"sha256:a4e27ed0b2504195f855b52052eadcc9795c59909c9d84314c5408687f933fc7",
"sha256:a733671100cd26d816eed39507e585c156e4498293a907029969234e5e634bc4",
"sha256:a75f306a16d9f9afebfbedc41c8c2351d8e61e818ba6b4c40815e2b5740bb6b8",
"sha256:bd717aa029217b8ef94a7d21632a3bb5a4e7218a4513d2521c2a2fd63011e98b",
"sha256:d25cecbac20713a7c3bc544372d42d8eafa89799f492a43b79e1dfd650484851",
"sha256:d26a2557d8f9122f9bf445fc7034242f4375bd4e95ecda007667540270965b13",
"sha256:d3545829ab42a66b84a9aaabf216a4dce7f16dbc76eb69be5c302ed6b8f4a29b",
"sha256:d3d5e10be0cf2a12214ddee45c6bd203dab435e3d83b4560c03066eda600bfe3",
"sha256:efe15aca4f64f3a7ea0c09c87826490e50ed166ce67368a68f315ea0807a20df"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==3.2.1"
},
"docutils": {
"hashes": [
"sha256:0c5b78adfbf7762415433f5515cd5c9e762339e23369dbe8000d84a4bf4ab3af",
"sha256:c2de3a60e9e7d07be26b7f2b00ca0309c207e06c100f9cc2a94931fc75a478fc"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==0.16"
},
"idna": {
"hashes": [
"sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6",
"sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.10"
},
"importlib-metadata": {
"hashes": [
"sha256:77a540690e24b0305878c37ffd421785a6f7e53c8b5720d211b211de8d0e95da",
"sha256:cefa1a2f919b866c5beb7c9f7b0ebb4061f30a8a9bf16d609b000e2dfaceb9c3"
],
"markers": "python_version < '3.8'",
"version": "==2.0.0"
},
"jeepney": {
"hashes": [
"sha256:3479b861cc2b6407de5188695fa1a8d57e5072d7059322469b62628869b8e36e",
"sha256:d6c6b49683446d2407d2fe3acb7a368a77ff063f9182fe427da15d622adc24cf"
],
"markers": "sys_platform == 'linux'",
"version": "==0.4.3"
},
"keyring": {
"hashes": [
"sha256:4e34ea2fdec90c1c43d6610b5a5fafa1b9097db1802948e90caf5763974b8f8d",
"sha256:9aeadd006a852b78f4b4ef7c7556c2774d2432bbef8ee538a3e9089ac8b11466"
],
"markers": "python_version >= '3.6'",
"version": "==21.4.0"
},
"packaging": {
"hashes": [
"sha256:4357f74f47b9c12db93624a82154e9b120fa8293699949152b22065d556079f8",
"sha256:998416ba6962ae7fbd6596850b80e17859a5753ba17c32284f67bfff33784181"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==20.4"
},
"pkginfo": {
"hashes": [
"sha256:a6a4ac943b496745cec21f14f021bbd869d5e9b4f6ec06918cffea5a2f4b9193",
"sha256:ce14d7296c673dc4c61c759a0b6c14bae34e34eb819c0017bb6ca5b7292c56e9"
],
"version": "==1.6.1"
},
"pycparser": {
"hashes": [
"sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0",
"sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.20"
},
"pygments": {
"hashes": [
"sha256:381985fcc551eb9d37c52088a32914e00517e57f4a21609f48141ba08e193fa0",
"sha256:88a0bbcd659fcb9573703957c6b9cff9fab7295e6e76db54c9d00ae42df32773"
],
"markers": "python_version >= '3.5'",
"version": "==2.7.2"
},
"pyparsing": {
"hashes": [
"sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1",
"sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"
],
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.4.7"
},
"readme-renderer": {
"hashes": [
"sha256:267854ac3b1530633c2394ead828afcd060fc273217c42ac36b6be9c42cd9a9d",
"sha256:6b7e5aa59210a40de72eb79931491eaf46fefca2952b9181268bd7c7c65c260a"
],
"version": "==28.0"
},
"requests": {
"hashes": [
"sha256:b3559a131db72c33ee969480840fff4bb6dd111de7dd27c8ee1f820f4f00231b",
"sha256:fe75cc94a9443b9246fc7049224f75604b113c36acb93f87b80ed42c44cbb898"
],
"index": "pypi",
"version": "==2.24.0"
},
"requests-toolbelt": {
"hashes": [
"sha256:380606e1d10dc85c3bd47bf5a6095f815ec007be7a8b69c878507068df059e6f",
"sha256:968089d4584ad4ad7c171454f0a5c6dac23971e9472521ea3b6d49d610aa6fc0"
],
"version": "==0.9.1"
},
"rfc3986": {
"hashes": [
"sha256:112398da31a3344dc25dbf477d8df6cb34f9278a94fee2625d89e4514be8bb9d",
"sha256:af9147e9aceda37c91a05f4deb128d4b4b49d6b199775fd2d2927768abdc8f50"
],
"version": "==1.4.0"
},
"secretstorage": {
"hashes": [
"sha256:15da8a989b65498e29be338b3b279965f1b8f09b9668bd8010da183024c8bff6",
"sha256:b5ec909dde94d4ae2fa26af7c089036997030f0cf0a5cb372b4cccabd81c143b"
],
"markers": "sys_platform == 'linux'",
"version": "==3.1.2"
},
"six": {
"hashes": [
"sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259",
"sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.15.0"
},
"tqdm": {
"hashes": [
"sha256:9ad44aaf0fc3697c06f6e05c7cf025dd66bc7bcb7613c66d85f4464c47ac8fad",
"sha256:ef54779f1c09f346b2b5a8e5c61f96fbcb639929e640e59f8cf810794f406432"
],
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==4.51.0"
},
"twine": {
"hashes": [
"sha256:34352fd52ec3b9d29837e6072d5a2a7c6fe4290e97bba46bb8d478b5c598f7ab",
"sha256:ba9ff477b8d6de0c89dd450e70b2185da190514e91c42cc62f96850025c10472"
],
"index": "pypi",
"version": "==3.2.0"
},
"urllib3": {
"hashes": [
"sha256:8d7eaa5a82a1cac232164990f04874c594c9453ec55eef02eab885aa02fc17a2",
"sha256:f5321fbe4bf3fefa0efd0bfe7fb14e90909eb62a48ccda331726b4319897dd5e"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.25.11"
},
"webencodings": {
"hashes": [
"sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78",
"sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"
],
"version": "==0.5.1"
},
"zipp": {
"hashes": [
"sha256:102c24ef8f171fd729d46599845e95c7ab894a4cf45f5de11a44cc7444fb1108",
"sha256:ed5eee1974372595f9e416cc7bbeeb12335201d8081ca8a0743c954d4446e5cb"
],
"markers": "python_version >= '3.6'",
"version": "==3.4.0"
}
}
}

@ -1,219 +0,0 @@
#!/usr/bin/env python3
"""A simple library to get data from scaruffi.com."""
import argparse
import logging
import re
from dataclasses import dataclass
from bs4 import BeautifulSoup, NavigableString
import requests
import log
LOG = None
SITE_URL = "https://scaruffi.com"
GENERAL_INDEX = SITE_URL + "/music/groups.html"
RATINGS_DECADES = SITE_URL + "/ratings/{:02}.html"
@dataclass
class Release:
    """A release parsed from a scaruffi.com ratings entry.

    Only the title is required; the artist and year keep their defaults
    when the entry could not be split into those fields.
    """

    title: str
    artist: str = ""
    # Usually the release year, not the recording year.
    year: int = 0
def main():
    """Command-line entry point: print musicians or decade ratings.

    Parses the command-line flags, configures the module-level LOG, then
    runs the query selected by --musicians or --ratings. Nothing is printed
    when the query fails.
    """
    global LOG

    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Print debug logs")
    parser.add_argument("-r", "--ratings", type=int,
                        help="Get ratings for a decade (e.g. 60)")
    parser.add_argument("-m", "--musicians", action="store_true",
                        help="Get the list of musicians")
    parser.add_argument("--offset", type=int, default=0,
                        help="Offset for paginated queries (default is 0)")
    parser.add_argument("--limit", type=int, default=20,
                        help="Limit for paginated queries (default is 20)")
    args = parser.parse_args()

    log_level = logging.DEBUG if args.verbose else logging.WARNING
    LOG = log.get_logger("scaruffi", level=log_level)

    if args.musicians:
        # get_musicians() returns None on error; fall back to an empty list
        # so a failed request does not end in a TypeError.
        for musician in get_musicians(args.offset, args.limit) or []:
            print(musician)
    elif args.ratings is not None:
        ratings = get_ratings(args.ratings)
        if ratings:
            for rating, releases in ratings.items():
                print(rating)
                for rel in releases:
                    print(f"- {rel.artist} - {rel.title} ({rel.year})")
def _get_page(url, timeout=10):
    """Fetch url with an HTTP GET and return the response body as text.

    timeout is the number of seconds to wait for the server; without it a
    stalled connection would block the caller forever. Returns None when the
    request raises (timeouts included) or the status code is not 200.
    """
    LOG.debug(f"GET {url}")
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        LOG.error(f"An exception occured during HTTP GET: {exc}")
        return None

    sc = response.status_code
    if sc != 200:
        LOG.error(f"Server returned HTTP response {sc} to {url}.")
        return None
    return response.text
def _get_soup(url):
    """Fetch url and parse it with html5lib; None if the fetch failed."""
    page = _get_page(url)
    return BeautifulSoup(page, "html5lib") if page else None
def get_musicians(offset=0, limit=20):
    """Get a list of musicians, or None on error.

    Returns at most limit names, starting at index offset of the general
    index.
    """
    soup = _get_soup(GENERAL_INDEX)
    if not soup:
        return None

    # Semantic Web? Just find the fattest table.
    # default=None keeps max() from raising ValueError on a page that has
    # no table at all.
    mu_table = max(
        soup.find_all("table"), key=lambda t: len(t.text), default=None
    )
    if mu_table is None:
        LOG.error("No table found in the musicians index.")
        return None

    musicians = [a_tag.text for a_tag in mu_table.find_all("a")]
    return musicians[offset:offset + limit]
def get_ratings(decade):
    """Get a dict of ratings to a release list for this decade.

    The decade must be a multiple of 10 in the [0, 90] range, or a full
    year from 1900 on whose last two digits form such a value (1960 for
    example). Returns None on error.
    """
    if decade >= 1900:
        decade %= 100
    if not (0 <= decade < 100 and decade % 10 == 0):
        LOG.error(f"Invalid decade value: {decade}.")
        return None

    soup = _get_soup(RATINGS_DECADES.format(decade))
    if not soup:
        return None

    # The ratings live in the table holding the most text. default=None
    # keeps max() from raising ValueError on a page without any table.
    ratings_table = max(
        soup.find_all("table"), key=lambda t: len(t.text), default=None
    )
    if ratings_table is None:
        LOG.error("No table found in the ratings page.")
        return None

    # Some decades use one list for all ratings, others one list per rating.
    lists = ratings_table("ul")
    if len(lists) == 1:
        return _get_ratings_from_unique_list(lists[0])
    return _get_ratings_from_lists(lists)
def _get_ratings_from_unique_list(messy_list):
    """Get ratings from decades where one list contains all ratings.

    Walks the children of messy_list in order: "li" tags are releases filed
    under the most recent rating seen, and any tag whose last word is a
    rating starts a new group. Returns None if a release shows up before
    any rating.
    """
    ratings = {}
    current_key = None
    for tag in messy_list:
        if isinstance(tag, NavigableString):
            continue

        # Get an entry for the current rating.
        if tag.name == "li":
            release = _parse_release(tag.text)
            # Compare with None: a 0.0 rating is falsy but still valid.
            if current_key is None:
                LOG.critical(f"Found release {release} without rating.")
                return None
            ratings[current_key].append(release)

        # Detect a new rating list.
        # Do it after getting entries in tag due to bad HTML.
        text = tag.text.strip()
        if text:
            rating = _match_rating(text.split()[-1])
            if rating is not None:
                current_key = rating
                # setdefault: seeing a rating again must not discard the
                # releases already collected for it.
                ratings.setdefault(current_key, [])
    return ratings
def _get_ratings_from_lists(lists):
    """Get ratings from several lists, one per rating.

    Each list is expected to hold its rating in its first "span" tag and
    its releases in "li" tags. Returns None as soon as a list has no
    readable rating.
    """
    ratings = {}
    for ul in lists:
        rating_tag = ul.span
        # Reset the rating for every list so that a list without a span is
        # reported instead of hitting an unbound name or silently reusing
        # the rating of the previous list.
        rating = _match_rating(rating_tag.text) if rating_tag else None
        if rating is None:
            LOG.critical("Failed to find rating tag in list.")
            return None
        ratings[rating] = [_parse_release(li.text) for li in ul("li")]
    return ratings
RATING_RE = re.compile(r"\s*(\d(.\d)?)/10\s*")
def _match_rating(text):
"""Try to match text as a rating and return the rating, or None."""
if not text.strip():
return None
match = RATING_RE.match(text.strip())
if match:
return float(match.group(1))
def _parse_release(entry):
    """Build a Release from a raw list entry, as well as we can.

    The usual format is "artist: title (year)". Without any ": " the parsed
    title doubles as the artist. When title and year cannot be separated,
    the unparsed text is used as the title and the year keeps its default.
    """
    entry = entry.strip("\r\n :")  # Remove bogus spaces and colons.
    artist, *title_parts = entry.split(": ")

    if not title_parts:
        LOG.info(f"No colon in {entry}, using both as artist and title.")
        parsed = _parse_release_title_year(entry)
        if not parsed:
            return Release(title=entry)
        title, year = parsed
        return Release(artist=title, title=title, year=year)

    # Only the first ": " separates the artist from the title; any further
    # ": " is assumed to belong to the title and is put back.
    title_parts[0] = title_parts[0].strip()
    title_and_year_str = ": ".join(title_parts)
    parsed = _parse_release_title_year(title_and_year_str)
    if not parsed:
        return Release(artist=artist, title=title_and_year_str)
    title, year = parsed
    return Release(artist=artist, title=title, year=year)
RATING_TITLE_AND_YEAR_RE = re.compile(r"(.+?)\s?\((\d{4})(?:-\d+)?\)")
def _parse_release_title_year(title_and_year):
"""Parse title and year in the approximate "title (year)" format.
In some instances, the year is actually a range of years, in the YYYY-YY
format. Sometimes there is no space between title and year."""
match = RATING_TITLE_AND_YEAR_RE.match(title_and_year)
if not match:
LOG.error(f"Failed to split title and year in \"{title_and_year}\".")
return None
groups = match.groups()
if len(groups) != 2 or None in groups:
LOG.error(f"Failed to parse title and year in \"{title_and_year}\".")
return None
title, year = groups
try:
year = int(year)
except ValueError:
LOG.error(f"Failed to parse year string \"{year}\" as an integer.")
year = 0
return title, year
if __name__ == "__main__":
main()

@ -0,0 +1,3 @@
[build-system]
requires = ["setuptools >= 38.3.0", "wheel"]
build-backend = "setuptools.build_meta"

@ -0,0 +1,42 @@
#!/usr/bin/env python3
"""A simple library to get data from scaruffi.com."""
import argparse
import logging
from scaruffi import api
def main():
    """Command-line entry point: print musicians or decade ratings.

    Parses the command-line flags, builds a ScaruffiApi with the requested
    log level, then runs the query selected by --musicians or --ratings.
    Nothing is printed when the query fails.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Print debug logs")
    parser.add_argument("-r", "--ratings", type=int,
                        help="Get ratings for a decade (e.g. 60)")
    parser.add_argument("-m", "--musicians", action="store_true",
                        help="Get the list of musicians")
    parser.add_argument("--offset", type=int, default=0,
                        help="Offset for paginated queries (default is 0)")
    parser.add_argument("--limit", type=int, default=20,
                        help="Limit for paginated queries (default is 20)")
    args = parser.parse_args()

    log_level = logging.DEBUG if args.verbose else logging.WARNING
    # The queries are methods of ScaruffiApi, which also sets up its own
    # logger; the api module has no module-level functions to call.
    scaruffi_api = api.ScaruffiApi(log_level=log_level)

    if args.musicians:
        # get_musicians() returns None on error; fall back to an empty list
        # so a failed request does not end in a TypeError.
        musicians = scaruffi_api.get_musicians(args.offset, args.limit)
        for musician in musicians or []:
            print(musician)
    elif args.ratings is not None:
        ratings = scaruffi_api.get_ratings(args.ratings)
        if ratings:
            for rating, releases in ratings.items():
                print(rating)
                for rel in releases:
                    print(f"- {rel.artist} - {rel.title} ({rel.year})")


if __name__ == "__main__":
    main()

@ -0,0 +1,172 @@
import logging
import re
from dataclasses import dataclass
from bs4 import BeautifulSoup, NavigableString
import requests
import scaruffi.log
SITE_URL = "https://scaruffi.com"
GENERAL_INDEX = SITE_URL + "/music/groups.html"
RATINGS_DECADES = SITE_URL + "/ratings/{:02}.html"
@dataclass
class Release:
    """A release parsed from a scaruffi.com ratings entry.

    Only the title is required; the artist and year keep their defaults
    when the entry could not be split into those fields.
    """

    title: str
    artist: str = ""
    # Usually the release year, not the recording year.
    year: int = 0
class ScaruffiApi:
    """Scraper for the musician index and decade ratings of scaruffi.com.

    The public queries (get_musicians, get_ratings) return None when the
    page cannot be fetched or parsed; details are reported through self.log.
    """

    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 10

    # A rating such as "7/10" or "7.5/10"; group 1 is the numeric part.
    # The dot is escaped so that only a real decimal point is accepted.
    RATING_RE = re.compile(r"\s*(\d(\.\d)?)/10\s*")

    # "title (year)", "title(year)" or "title (year-yy)"; group 1 is the
    # title and group 2 the four-digit (first) year.
    RATING_TITLE_AND_YEAR_RE = re.compile(r"(.+?)\s?\((\d{4})(?:-\d+)?\)")

    def __init__(self, log_level=logging.WARNING):
        """Create the API object and its logger at the given level."""
        self.log = scaruffi.log.get_logger("scaruffi", level=log_level)

    def _get_soup(self, url):
        """Fetch url and parse it with html5lib; None if the fetch failed."""
        html = self._get_page(url)
        if not html:
            return None
        return BeautifulSoup(html, "html5lib")

    def _get_page(self, url):
        """Fetch url with an HTTP GET and return the response body as text.

        Returns None when the request raises (timeouts included) or the
        status code is not 200.
        """
        self.log.debug(f"GET {url}")
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as exc:
            self.log.error(f"An exception occured during HTTP GET: {exc}")
            return None

        sc = response.status_code
        if sc != 200:
            self.log.error(f"Server returned HTTP response {sc} to {url}.")
            return None
        return response.text

    def _largest_table(self, soup):
        """Return the table of soup holding the most text, or None."""
        # default=None keeps max() from raising ValueError on a page that
        # has no table at all.
        return max(
            soup.find_all("table"), key=lambda t: len(t.text), default=None
        )

    def get_musicians(self, offset=0, limit=20):
        """Get a list of musicians, or None on error.

        Returns at most limit names, starting at index offset of the general
        index.
        """
        soup = self._get_soup(GENERAL_INDEX)
        if not soup:
            return None

        # Semantic Web? Just find the fattest table.
        mu_table = self._largest_table(soup)
        if mu_table is None:
            self.log.error("No table found in the musicians index.")
            return None

        musicians = [a_tag.text for a_tag in mu_table.find_all("a")]
        return musicians[offset:offset + limit]

    def get_ratings(self, decade):
        """Get a dict of ratings to a release list for this decade.

        The decade must be a multiple of 10 in the [0, 90] range, or a full
        year from 1900 on whose last two digits form such a value (1960 for
        example). Returns None on error.
        """
        if decade >= 1900:
            decade %= 100
        if not (0 <= decade < 100 and decade % 10 == 0):
            self.log.error(f"Invalid decade value: {decade}.")
            return None

        soup = self._get_soup(RATINGS_DECADES.format(decade))
        if not soup:
            return None

        ratings_table = self._largest_table(soup)
        if ratings_table is None:
            self.log.error("No table found in the ratings page.")
            return None

        # Some decades use one list for all ratings, others one per rating.
        lists = ratings_table("ul")
        if len(lists) == 1:
            return self._get_ratings_from_unique_list(lists[0])
        return self._get_ratings_from_lists(lists)

    def _get_ratings_from_unique_list(self, messy_list):
        """Get ratings from decades where one list contains all ratings.

        Walks the children of messy_list in order: "li" tags are releases
        filed under the most recent rating seen, and any tag whose last word
        is a rating starts a new group. Returns None if a release shows up
        before any rating.
        """
        ratings = {}
        current_key = None
        for tag in messy_list:
            if isinstance(tag, NavigableString):
                continue

            # Get an entry for the current rating.
            if tag.name == "li":
                release = self._parse_release(tag.text)
                # Compare with None: a 0.0 rating is falsy but still valid.
                if current_key is None:
                    self.log.critical(f"Release {release} without rating.")
                    return None
                ratings[current_key].append(release)

            # Detect a new rating list.
            # Do it after getting entries in tag due to bad HTML.
            text = tag.text.strip()
            if text:
                rating = self._match_rating(text.split()[-1])
                if rating is not None:
                    current_key = rating
                    # setdefault: seeing a rating again must not discard
                    # the releases already collected for it.
                    ratings.setdefault(current_key, [])
        return ratings

    def _get_ratings_from_lists(self, lists):
        """Get ratings from several lists, one per rating.

        Each list is expected to hold its rating in its first "span" tag and
        its releases in "li" tags. Returns None as soon as a list has no
        readable rating.
        """
        ratings = {}
        for ul in lists:
            rating_tag = ul.span
            # Reset the rating for every list so that a list without a span
            # is reported instead of hitting an unbound name or silently
            # reusing the rating of the previous list.
            rating = self._match_rating(rating_tag.text) if rating_tag else None
            if rating is None:
                self.log.critical("Failed to find rating tag in list.")
                return None
            ratings[rating] = [self._parse_release(li.text) for li in ul("li")]
        return ratings

    def _match_rating(self, text):
        """Try to match text as a rating and return the rating, or None.

        The rating is returned as a float (7.5 for "7.5/10").
        """
        match = self.RATING_RE.match(text.strip())
        return float(match.group(1)) if match else None

    def _parse_release(self, entry):
        """Build a Release from a raw list entry, as well as we can.

        The usual format is "artist: title (year)". Without any ": " the
        parsed title doubles as the artist. When title and year cannot be
        separated, the unparsed text is used as the title.
        """
        entry = entry.strip("\r\n :")  # Remove bogus spaces and colons.
        parts = entry.split(": ")
        if len(parts) == 1:
            self.log.info(f"No colon in {entry}, using both as artist & title.")
            title_and_year = self._parse_release_title_year(entry)
            if not title_and_year:
                return Release(title=entry)
            title, year = title_and_year
            artist = title
        else:
            # Usual case is 2 parts ("artist: title"), but in case one of them
            # contains ": " as well, assume that it is part of the title, not
            # the artist name.
            artist = parts[0]
            title_and_year_str = parts[1].strip()
            if len(parts) > 2:
                title_and_year_str += ": " + ": ".join(parts[2:])
            title_and_year = self._parse_release_title_year(title_and_year_str)
            if not title_and_year:
                return Release(artist=artist, title=title_and_year_str)
            title, year = title_and_year
        return Release(artist=artist, title=title, year=year)

    def _parse_release_title_year(self, title_year):
        """Parse title and year in the approximate "title (year)" format.

        In some instances, the year is actually a range of years, in the
        YYYY-YY format; only the first year is kept. Sometimes there is no
        space between title and year.

        Returns a (title, year) tuple with year as an int, or None when the
        text does not match.
        """
        match = self.RATING_TITLE_AND_YEAR_RE.match(title_year)
        if not match:
            self.log.error(f"Failed to split title/year in \"{title_year}\".")
            return None
        # Both groups are mandatory in the pattern and the year group is
        # exactly four digits, so a match needs no further validation.
        title, year = match.groups()
        return title, int(year)

@ -0,0 +1,25 @@
import logging
import unittest
from scaruffi.api import ScaruffiApi
class TestScaruffi(unittest.TestCase):
    """Smoke tests for ScaruffiApi; nothing is mocked, so they run the real
    queries."""

    def setUp(self):
        self.api = ScaruffiApi()

    def tearDown(self):
        self.api = None

    def test_get_musicians(self):
        # With default arguments a page holds 20 names.
        names = self.api.get_musicians()
        self.assertEqual(len(names), 20)

    def test_get_ratings(self):
        # Every decade from 1960 to 2010 must yield a result.
        for decade in range(1960, 2011, 10):
            self.assertIsNotNone(self.api.get_ratings(decade))

@ -0,0 +1,23 @@
[metadata]
name = scaruffi
version = 0.0.1
description = Get some data from scaruffi.com.
long_description = file: README.md
# The README is Markdown; PyPI assumes reStructuredText unless told otherwise.
long_description_content_type = text/markdown
license = MIT
author = dece
author_email = shgck@pistache.land
url = https://github.com/Dece/Scaruffi
classifiers =
    Environment :: Console
    License :: OSI Approved :: MIT License
    Programming Language :: Python :: 3
    Programming Language :: Python :: 3.7

[options]
packages = scaruffi
python_requires = >= 3.7
setup_requires = setuptools >= 38.3.0
# Runtime imports of the scaruffi package; html5lib is the parser name
# handed to BeautifulSoup. The requests range mirrors the Pipfile.
install_requires =
    beautifulsoup4
    html5lib
    requests ~= 2.24

[options.entry_points]
console_scripts =
    scaruffi = scaruffi.__main__:main

@ -0,0 +1,2 @@
from setuptools import setup
setup()
Loading…
Cancel
Save