diff options
author | Peter Bex <peter@more-magic.net> | 2025-07-29 14:52:12 +0200 |
---|---|---|
committer | Peter Bex <peter@more-magic.net> | 2025-08-05 09:41:44 +0200 |
commit | 84a9205579a6d069a40d23b4bc815bc1a50b8181 (patch) | |
tree | 82a7986e4f60a5d222ff0ceefbe6f199c57843e5 | |
download | slsets-84a9205579a6d069a40d23b4bc815bc1a50b8181.tar.gz |
Initial implementation of slsets egg
This implements sets of symbols *as lists*, like in the srfi-1 lset
operations. It maintains the benefit of using lists for everything,
but not having to pay the quadratic performance penalty that srfi-1
entails. We achieve this by using plist operations to "mark" items
so we have to visit the list "set" only a fixed number of times.
-rw-r--r-- | benchmarks/benchmark.scm | 110 | ||||
-rw-r--r-- | slsets.egg | 8 | ||||
-rw-r--r-- | slsets.scm | 155 | ||||
-rw-r--r-- | tests/run.scm | 100 |
4 files changed, 373 insertions, 0 deletions
diff --git a/benchmarks/benchmark.scm b/benchmarks/benchmark.scm new file mode 100644 index 0000000..21d3c43 --- /dev/null +++ b/benchmarks/benchmark.scm @@ -0,0 +1,110 @@ +;; +;; Simple benchmarking program to compare set-difference on hash +;; tables, lsets and slsets. +;; +;; Note that the tests for slsets is severely disadvantaged because +;; the hash tables are created with the size of the set, which +;; normally would not be the case because you can't predict ahead of +;; time how big it would get. Also, the symbols have a short string +;; value which makes hashing fast. Slsets don't care about the symbol +;; size and don't need a preallocated size. + +(import (chicken fixnum) (chicken string) (chicken process-context) srfi-1 srfi-69 slsets) + +(unless (= (length (command-line-arguments)) 2) + (error "Need impl name (one of lset, slset, symbolset or srfi-69) and set size")) + +(define impl (string->symbol (car (command-line-arguments)))) +(define size (string->number (cadr (command-line-arguments)))) + +(define-inline (list-tabulate n fun) + (let lp ((i 0) + (res '())) + (if (= i n) + res + (lp (add1 i) + (cons (fun i) res))))) + +(define lst (list-tabulate size (lambda (i) (string->symbol (conc "x" i))))) +(define lst (list-tabulate size (lambda (i) (string->symbol (conc "y" i))))) + +(define-inline (filter pred lst) + (let lp ((lst lst) + (res '())) + (cond ((null? lst) (reverse res)) + ((not (pred (car lst))) (lp (cdr lst) res)) + (else (lp (cdr lst) (cons (car lst) res)))))) + +;;;;;;;;;;;; +;;; Quick and dirty hash table as used in CHICKEN internally +;;;;;;;;;;;; + +(define hash-symbol + (let ((cache-s #f) + (cache-h #f) + ;; NOTE: All low-level hash tables share the same randomization factor + (rand (##core#inline "C_rand" #x10000))) + (lambda (s n) + (if (eq? s cache-s) + (##core#inline "C_fixnum_modulo" cache-h n) + (let ((bv (##sys#slot s 1))) + (set! cache-s s) + (set! cache-h (##core#inline "C_u_i_bytevector_hash" bv 0 (fx- (##sys#size bv) 1) rand)) + (##core#inline "C_fixnum_modulo" cache-h n)))))) + +(define (make-my-hash-table #!optional (size 301)) + (make-vector size '())) + +(define (my-hash-table-ref ht key) + (let loop ((bucket (##sys#slot ht (hash-symbol key (##sys#size ht))))) + (and (not (eq? '() bucket)) + (if (eq? key (##sys#slot (##sys#slot bucket 0) 0)) + (##sys#slot (##sys#slot bucket 0) 1) + (loop (##sys#slot bucket 1)))))) + +(define (my-hash-table-set! ht key val) + (let* ((k (hash-symbol key (##sys#size ht))) + (ib (##sys#slot ht k))) + (let loop ((bucket ib)) + (if (eq? '() bucket) + (##sys#setslot ht k (cons (cons key val) ib)) + (if (eq? key (##sys#slot (##sys#slot bucket 0) 0)) + (##sys#setslot (##sys#slot bucket 0) 1 val) + (loop (##sys#slot bucket 1))))))) + +(define (symbolset-difference ls . lss) + (let ((seen (make-my-hash-table size))) + (for-each (lambda (lst) + (for-each (lambda (x) (my-hash-table-set! seen x #t)) lst)) + lss) + (filter (lambda (x) (not (my-hash-table-ref seen x))) ls))) + +;;;;;;;;;;;; + +;; For comparison, a bit strange +(define (srfi-69-hash-table-difference ht1 ht2) + (let ((result (hash-table-copy ht1))) + (hash-table-walk ht2 (lambda (k v) + (hash-table-delete! result k))) + result)) + +;;;;;;;;;;;; + +(let lp ((i 10000)) + (case impl + ((lset) + (lset-difference eq? lst (cons 'aha lst))) + ((slset) + (slset-difference lst (cons 'aha lst))) + ((symbolset) + (symbolset-difference lst (cons 'aha lst))) + ((srfi-69) + (when (pair? lst) + (let ((ht (make-hash-table eq? symbol-hash size))) + (begin + (for-each (lambda (x) (hash-table-set! ht x #t)) lst) + (set! lst ht)))) + (srfi-69-hash-table-difference lst lst)) + (else (error "Unknown impl" impl))) + (unless (zero? i) + (lp (sub1 i)))) diff --git a/slsets.egg b/slsets.egg new file mode 100644 index 0000000..fd2b6f3 --- /dev/null +++ b/slsets.egg @@ -0,0 +1,8 @@ +;;; slsets.egg -*- Scheme -*- + +((synopsis "Lists of symbols as sets") + (author "Peter Bex") + (category data) + (license "BSD") + (test-dependencies test) + (components (extension slsets (csc-options "-O3")))) diff --git a/slsets.scm b/slsets.scm new file mode 100644 index 0000000..aef229f --- /dev/null +++ b/slsets.scm @@ -0,0 +1,155 @@ +;; +;; Lists of symbols as sets +;; +;; Essentially, this provides specialized versions of SRFI-1's lset +;; operations for symbols but with linear instead of quadratic time +;; complexity. It's even faster than using a side hash table, unless +;; the symbols already have large plists associated with them. +;; +(module slsets + (slset<= slset= slset-adjoin ;; TODO: slset-delete and slset-contains? + slset-difference slset-intersection + slset-difference+intersection + slset-union slset-xor + slset-deduplicate) + +(import scheme (chicken base) (chicken foreign) (chicken plist)) + +(define-inline (mark! x marking) + (when (symbol? x) (put! x marking #t))) + +(define-inline (mark-list! lst marking) + (for-each (lambda (x) (mark! x marking)) lst)) + +(define-inline (unmark! x marking) + (when (symbol? x) (remprop! x marking))) + +(define-inline (unmark-list! lst marking) + (for-each (lambda (x) (unmark! x marking)) lst)) + +(define-inline (with-marked-list lst fun) + (let ((marking (gensym 'm))) + (dynamic-wind + (lambda () (mark-list! lst marking)) + (lambda () (fun marking)) + (lambda () (unmark-list! lst marking))))) + +(define-inline (every? pred lst) + (let lp ((lst lst)) + (cond ((null? lst) #t) + ((not (pred (car lst))) #f) + (else (lp (cdr lst)))))) + +(define-inline (marked? x marking) + (and (symbol? x) (get x marking))) + +(define (last lst) + (if (null? (cdr lst)) + (car lst) + (last (cdr lst)))) + +(define-inline (filter pred lst) + (let lp ((lst lst) + (res '())) + (cond ((null? lst) (reverse res)) + ((not (pred (car lst))) (lp (cdr lst) res)) + (else (lp (cdr lst) (cons (car lst) res)))))) + +(define (slset<= . lsts) + (or (null? lsts) + (let ((last-list (last lsts))) + (with-marked-list last-list + (lambda (m) + (every? (lambda (lst) + (or (eq? lst last-list) ; Avoid filtering out this one and building a new list + (every? (lambda (x) (marked? x m)) lst))) + lsts)))))) + +(define (slset= . lsts) + (or (null? lsts) + (let ((lst (car lsts)) + (lsts (cdr lsts))) + (every? (lambda (l) + (and (slset<= lst l) (slset<= l lst))) + lsts)))) + +(define (slset-adjoin lst . els) + (with-marked-list lst + (lambda (m) + (let lp ((lst lst) + (els els)) + (cond ((null? els) lst) + ((marked? (car els) m) (lp lst (cdr els))) + (else (mark! (car els) m) + (lp (cons (car els) lst) (cdr els)))))))) + +(define (slset-difference lst . lsts) + (let lp ((lsts lsts) + (res lst)) + (if (null? lsts) + res + (lp (cdr lsts) + (with-marked-list (car lsts) (lambda (m) (filter (lambda (x) (not (marked? x m))) res))))))) + +(define (slset-intersection lst . lsts) + (let lp ((lsts lsts) + (res lst)) + (if (null? lsts) + res + (lp (cdr lsts) + (with-marked-list (car lsts) + (lambda (m) (filter (lambda (x) (marked? x m)) res))))))) + +;; TODO: Make this faster +(define (slset-difference+intersection lst . lsts) + (values (apply slset-difference lst lsts) + (apply slset-intersection lst lsts))) + +;; NOTE: Can't use mark-list here, so roll our own! +;; TODO: Clean up if there's an error cdring down any of these lists? +(define (slset-union . lsts) + (if (null? lsts) + '() + (let ((marking (gensym 'm)) + (first-list (car lsts))) + (mark-list! first-list marking) + (let lp ((lsts (cdr lsts)) + (res first-list)) + (if (null? lsts) + (begin (unmark-list! res marking) + res) + (let lp2 ((lst (car lsts)) + (res res)) + (if (null? lst) + (lp (cdr lsts) res) + (let ((x (car lst))) + (if (marked? x marking) + (lp2 (cdr lst) res) + (begin (mark! x marking) + (lp2 (cdr lst) (cons x res)))))))))))) + +(define (slset-xor . lsts) + (let lp ((lsts lsts) + (res '())) + (if (null? lsts) + res + (lp (cdr lsts) + (append (slset-difference res (car lsts)) + (slset-difference (car lsts) res)))))) + +;; NOTE: Can't use mark-list here, so roll our own! +;; TODO: Clean up if there's an error cdring down the list? +(define (slset-deduplicate lst) + (let ((marking (gensym 'm))) + (let lp ((lst lst) + (res '())) + (if (null? lst) + (begin (unmark-list! res marking) + (reverse res)) + (let ((x (car lst))) + (cond ((marked? x marking) + (lp (cdr lst) res)) + (else (mark! x marking) + (lp (cdr lst) (cons x res))))))))) + +) diff --git a/tests/run.scm b/tests/run.scm new file mode 100644 index 0000000..06f4f2c --- /dev/null +++ b/tests/run.scm @@ -0,0 +1,100 @@ +;; (load "../slsets.scm") +(import slsets test) + +(test-group "slsets" + (test-group "slset<=" + (test-group "examples from srfi-1 document" + (test-assert (slset<= '(a) '(a b a) '(a b c c))) + (test-assert "trivial case no args" (slset<=)) + (test-assert "trivial case one arg" (slset<= '(a)))) + (test-group "negatives" + (test-assert (not (slset<= '(a b a) '(a b c c) '(a)))) + (test-assert (not (slset<= '(a b) '(a c)))))) + + (test-group "slset=" + (test-group "examples from srfi-1 document" + (test-assert (slset= '(b e a) '(a e b) '(e e b a))) + (test-assert "trivial case no args" (slset=)) + (test-assert "trivial case one arg" (slset= '(a)))) + (test-group "negatives" + (test-assert (not (slset= '(b e a) '(a e b x) '(e e b a)))) + (test-assert (not (slset= '(b e a x) '(a e b) '(e e b a)))) + (test-assert (not (slset= '(b e a) '(a e b) '(e e b a x)))))) + + (test-group "slset-adjoin" + (test-group "example from srfi-1 document" + (test '(u o i a b c d c e) + (slset-adjoin '(a b c d c e) 'a 'e 'i 'o 'u))) + ;; Spec says it should, so check it + (test-group "result shares a common tail with the list argument" + (let* ((lst '(a b c d c e)) + (result (slset-adjoin lst 'a 'x 'e 'b))) + (test-assert (eq? lst (cdr result)))))) + + (test-group "slset-union" + (test-group "examples from srfi-1 document" + (test '(u o i a b c d e) (slset-union '(a b c d e) '(a e i o u))) + (test "Repeated elements in LIST1 are preserved" + '(x a a c) + (slset-union '(a a c) '(x a x))) + (test "trivial case no args" + '() + (slset-union)) + (test "trivial case one arg" + '(a b c) + (slset-union '(a b c))))) + + (test-group "slset-xor" + (test-group "examples from srfi-1 document" + ;;(test '(d c b i o u) (slset-xor '(a b c d e) '(a e i o u))) + ;; NOTE: The above has a peculiar ordering, but there's nothing + ;; in the spec guaranteeing it should be like that. We return this: + (test '(b c d i o u) (slset-xor '(a b c d e) '(a e i o u))) + (test "trivial case no args" + '() (slset-xor)) + (test "trivial case one arg" + '(a b c d e) (slset-xor '(a b c d e)))) + (test-group "extra examples" + (test "for multiple lists, returns elements that appear in an odd number of lists" + '(c d i u e x) (slset-xor '(a b c d e) '(a e i o u) '(b o e) '(x))) + (test "duplicates in lists are preserved" + '(b b c d d i o u o) (slset-xor '(a b b c d e d) '(a e i o u o))))) + + (test-group "slset-difference" + (test-group "examples from srfi-1 document" + (test '(b c d) (slset-difference '(a b c d e) '(a e i o u))) + (test "trivial case" + '(a b c) (slset-difference '(a b c)))) + (test-group "more examples" + (test "duplicates in first list are preserved" + '(b b c d) + (slset-difference '(a b b c x x d x i e) '(a e i o u) '() '(x x y z z))) + (test "empty list returns same list" + '(a b b c d e) + (slset-difference '(a b b c d e) '())) + (test "empty list is empty" + '() + (slset-difference '() '(a b b c d e))) + (test "empty list is still empty after other args" + '() + (slset-difference '() '(a b) '(c d e) '())))) + + (test-group "slset-intersection" + (test-group "examples from srfi-1 document" + (test '(a e) (slset-intersection '(a b c d e) '(a e i o u))) + (test "repeated elements in LIST1 are preserved" + '(a x a) (slset-intersection '(a x y a) '(x a x z))) + (test "trivial case" + '(a b c) (slset-intersection '(a b c)))) + (test-group "more examples" + (test "multiple lists" + '(a a) (slset-intersection '(a x b y b a) '(x a x z) '(a b))))) + + (test-group "slset-deduplicate (as per srfi-1's delete-duplicates)" + (test-group "example from srfi-1 document" + (test '(a b c z) + (slset-deduplicate '(a b a c a b c z)))) + (test-group "more examples" + (test '() (slset-deduplicate '()))))) + +(test-exit) |