Cassandra - origin URL hashing encoding issue
The `origin`, `origin_visit` and `origin_visit_status` replayers often failed with this stack trace:
Traceback (most recent call last):
File "/usr/bin/swh", line 11, in <module>
load_entry_point('swh.core==0.14.3', 'console_scripts', 'swh')()
File "/usr/lib/python3/dist-packages/swh/core/cli/__init__.py", line 185, in main
return swh(auto_envvar_prefix="SWH")
File "/usr/lib/python3/dist-packages/click/core.py", line 764, in __call__
return self.main(*args, **kwargs)
File "/usr/lib/python3/dist-packages/click/core.py", line 717, in main
rv = self.invoke(ctx)
File "/usr/lib/python3/dist-packages/click/core.py", line 1137, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/usr/lib/python3/dist-packages/click/core.py", line 1137, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/usr/lib/python3/dist-packages/click/core.py", line 956, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/usr/lib/python3/dist-packages/click/core.py", line 555, in invoke
return callback(*args, **kwargs)
File "/usr/lib/python3/dist-packages/click/decorators.py", line 17, in new_func
return f(get_current_context(), *args, **kwargs)
File "/usr/lib/python3/dist-packages/swh/storage/cli.py", line 194, in replay
client.process(worker_fn)
File "/usr/lib/python3/dist-packages/swh/journal/client.py", line 265, in process
batch_processed, at_eof = self.handle_messages(messages, worker_fn)
File "/usr/lib/python3/dist-packages/swh/journal/client.py", line 292, in handle_messages
worker_fn(dict(objects))
File "/usr/lib/python3/dist-packages/swh/storage/replay.py", line 62, in process_replay_objects
_insert_objects(object_type, objects, storage)
File "/usr/lib/python3/dist-packages/swh/storage/replay.py", line 144, in _insert_objects
storage.origin_add(origins)
File "/usr/lib/python3/dist-packages/swh/storage/cassandra/storage.py", line 977, in origin_add
origins = [ori for ori in to_add if self.origin_get_one(ori.url) is None]
File "/usr/lib/python3/dist-packages/swh/storage/cassandra/storage.py", line 977, in <listcomp>
origins = [ori for ori in to_add if self.origin_get_one(ori.url) is None]
File "/usr/lib/python3/dist-packages/swh/storage/cassandra/storage.py", line 877, in origin_get_one
rows = list(self._cql_runner.origin_get_by_url(origin_url))
File "/usr/lib/python3/dist-packages/swh/storage/cassandra/cql.py", line 783, in origin_get_by_url
return self.origin_get_by_sha1(hash_url(url))
File "/usr/lib/python3/dist-packages/swh/storage/cassandra/common.py", line 16, in hash_url
return hashlib.sha1(url.encode("ascii")).digest()
UnicodeEncodeError: 'ascii' codec can't encode character '\u212a' in position 22: ordinal not in range(128)
Another example, failing in the same `hash_url` call:
File "/usr/lib/python3/dist-packages/swh/storage/cassandra/common.py", line 16, in hash_url
return hashlib.sha1(url.encode("ascii")).digest()
UnicodeEncodeError: 'ascii' codec can't encode characters in position 41-43: ordinal not in range(128)
Migrated from T3394 (view on Phabricator)
Edited by Vincent Sellier