diff --git a/src/borg/locking.py b/src/borg/locking.py index a90fbcfd8..b14b50ade 100644 --- a/src/borg/locking.py +++ b/src/borg/locking.py @@ -130,8 +130,7 @@ class ExclusiveLock: except FileExistsError: # already locked if self.by_me(): return self - if self.kill_stale_lock(): - pass + self.kill_stale_lock() if timer.timed_out_or_sleep(): raise LockTimeout(self.path) except OSError as err: @@ -167,7 +166,7 @@ class ExclusiveLock: # It's safer to just exit return False - if not platform.process_alive(host, pid, thread): + if platform.process_alive(host, pid, thread): return False if not self.ok_to_kill_stale_locks: @@ -224,17 +223,17 @@ class LockRoster: # Just nuke the stale locks early on load if self.ok_to_kill_zombie_locks: for key in (SHARED, EXCLUSIVE): - elements = set() try: - for e in data[key]: - (host, pid, thread) = e - if not platform.process_alive(host, pid, thread): - elements.add(tuple(e)) - else: - logger.warning('Removed stale %s roster lock for pid %d.', key, pid) - data[key] = list(list(e) for e in elements) + entries = data[key] except KeyError: - pass + continue + elements = set() + for host, pid, thread in entries: + if platform.process_alive(host, pid, thread): + elements.add((host, pid, thread)) + else: + logger.warning('Removed stale %s roster lock for pid %d.', key, pid) + data[key] = list(elements) except (FileNotFoundError, ValueError): # no or corrupt/empty roster file? data = {} diff --git a/src/borg/platform/posix.pyx b/src/borg/platform/posix.pyx index 1351254a7..e64dc5d8c 100644 --- a/src/borg/platform/posix.pyx +++ b/src/borg/platform/posix.pyx @@ -35,18 +35,23 @@ def get_process_id(): def process_alive(host, pid, thread): - """Check if the (host, pid, thread_id) combination corresponds to a dead process on our local node or not.""" + """ + Check if the (host, pid, thread_id) combination corresponds to a potentially alive process. + + If the process is local, then this will be accurate. If the process is not local, then this + returns always True, since there is no real way to check. + """ from . import local_pid_alive if host != _hostname: - return False + return True if thread != 0: # Currently thread is always 0, if we ever decide to set this to a non-zero value, # this code needs to be revisited, too, to do a sensible thing - return False + return True - return local_pid_alive + return local_pid_alive(pid) def local_pid_alive(pid): """Return whether *pid* is alive.""" @@ -62,4 +67,4 @@ def local_pid_alive(pid): # ESRCH = no such process return False # Any other error (eg. permissions) mean that the process ID refers to a live process - return True \ No newline at end of file + return True diff --git a/src/borg/testsuite/locking.py b/src/borg/testsuite/locking.py index 850c0ac56..fe8676d4a 100644 --- a/src/borg/testsuite/locking.py +++ b/src/borg/testsuite/locking.py @@ -1,22 +1,25 @@ +import random import time import pytest -from ..locking import get_id, TimeoutTimer, ExclusiveLock, Lock, LockRoster, \ - ADD, REMOVE, SHARED, EXCLUSIVE, LockTimeout - +from ..platform import get_process_id, process_alive +from ..locking import TimeoutTimer, ExclusiveLock, Lock, LockRoster, \ + ADD, REMOVE, SHARED, EXCLUSIVE, LockTimeout, NotLocked, NotMyLock ID1 = "foo", 1, 1 ID2 = "bar", 2, 2 -def test_id(): - hostname, pid, tid = get_id() - assert isinstance(hostname, str) - assert isinstance(pid, int) - assert isinstance(tid, int) - assert len(hostname) > 0 - assert pid > 0 +@pytest.fixture() +def free_pid(): + """Return a free PID not used by any process (naturally this is racy)""" + host, pid, tid = get_process_id() + while True: + # PIDs are often restricted to a small range. On Linux the range >32k is by default not used. + pid = random.randint(33000, 65000) + if not process_alive(host, pid, tid): + return pid class TestTimeoutTimer: @@ -57,6 +60,22 @@ class TestExclusiveLock: with pytest.raises(LockTimeout): ExclusiveLock(lockpath, id=ID2, timeout=0.1).acquire() + def test_kill_stale(self, lockpath, free_pid): + host, pid, tid = our_id = get_process_id() + dead_id = host, free_pid, tid + cant_know_if_dead_id = 'foo.bar.example.net', 1, 2 + + dead_lock = ExclusiveLock(lockpath, id=dead_id).acquire() + with ExclusiveLock(lockpath, id=our_id, kill_stale_locks=True): + with pytest.raises(NotMyLock): + dead_lock.release() + with pytest.raises(NotLocked): + dead_lock.release() + + with ExclusiveLock(lockpath, id=cant_know_if_dead_id): + with pytest.raises(LockTimeout): + ExclusiveLock(lockpath, id=our_id, kill_stale_locks=True, timeout=0.1).acquire() + class TestLock: def test_shared(self, lockpath): @@ -117,6 +136,25 @@ class TestLock: with pytest.raises(LockTimeout): Lock(lockpath, exclusive=True, id=ID2, timeout=0.1).acquire() + def test_kill_stale(self, lockpath, free_pid): + host, pid, tid = our_id = get_process_id() + dead_id = host, free_pid, tid + cant_know_if_dead_id = 'foo.bar.example.net', 1, 2 + + dead_lock = Lock(lockpath, id=dead_id, exclusive=True).acquire() + roster = dead_lock._roster + with Lock(lockpath, id=our_id, kill_stale_locks=True): + assert roster.get(EXCLUSIVE) == set() + assert roster.get(SHARED) == {our_id} + assert roster.get(EXCLUSIVE) == set() + assert roster.get(SHARED) == set() + with pytest.raises(KeyError): + dead_lock.release() + + with Lock(lockpath, id=cant_know_if_dead_id, exclusive=True): + with pytest.raises(LockTimeout): + Lock(lockpath, id=our_id, kill_stale_locks=True, timeout=0.1).acquire() + @pytest.fixture() def rosterpath(tmpdir): @@ -144,3 +182,28 @@ class TestLockRoster: roster2 = LockRoster(rosterpath, id=ID2) roster2.modify(SHARED, REMOVE) assert roster2.get(SHARED) == set() + + def test_kill_stale(self, rosterpath, free_pid): + host, pid, tid = our_id = get_process_id() + dead_id = host, free_pid, tid + + roster1 = LockRoster(rosterpath, id=dead_id) + assert roster1.get(SHARED) == set() + roster1.modify(SHARED, ADD) + assert roster1.get(SHARED) == {dead_id} + + cant_know_if_dead_id = 'foo.bar.example.net', 1, 2 + roster1 = LockRoster(rosterpath, id=cant_know_if_dead_id) + assert roster1.get(SHARED) == {dead_id} + roster1.modify(SHARED, ADD) + assert roster1.get(SHARED) == {dead_id, cant_know_if_dead_id} + + killer_roster = LockRoster(rosterpath, kill_stale_locks=True) + # Did kill the dead processes lock (which was alive ... I guess?!) + assert killer_roster.get(SHARED) == {cant_know_if_dead_id} + killer_roster.modify(SHARED, ADD) + assert killer_roster.get(SHARED) == {our_id, cant_know_if_dead_id} + + other_killer_roster = LockRoster(rosterpath, kill_stale_locks=True) + # Did not kill us, since we're alive + assert other_killer_roster.get(SHARED) == {our_id, cant_know_if_dead_id} diff --git a/src/borg/testsuite/platform.py b/src/borg/testsuite/platform.py index 9bd81d2b4..0ae1458ae 100644 --- a/src/borg/testsuite/platform.py +++ b/src/borg/testsuite/platform.py @@ -1,5 +1,6 @@ import functools import os +import random import shutil import sys import tempfile @@ -7,7 +8,9 @@ import pwd import unittest from ..platform import acl_get, acl_set, swidth +from ..platform import get_process_id, process_alive from . import BaseTestCase, unopened_tempfile +from .locking import free_pid ACCESS_ACL = """ @@ -186,3 +189,22 @@ class PlatformPosixTestCase(BaseTestCase): def test_swidth_mixed(self): self.assert_equal(swidth("borgバックアップ"), 4 + 6 * 2) + + +def test_process_alive(free_pid): + id = get_process_id() + assert process_alive(*id) + host, pid, tid = id + assert process_alive(host + 'abc', pid, tid) + assert process_alive(host, pid, tid + 1) + assert not process_alive(host, free_pid, tid) + + +def test_process_id(): + hostname, pid, tid = get_process_id() + assert isinstance(hostname, str) + assert isinstance(pid, int) + assert isinstance(tid, int) + assert len(hostname) > 0 + assert pid > 0 + assert get_process_id() == (hostname, pid, tid)