Python论坛的帖子：

Sun Apr 30 15:33:23 HKT 2006

感谢列表内朋友的帮助。

问题背景：一个论坛，每个帖子有一个tid, 每一个用户有一个uid，当用户回复了某帖子，就生成： tid;uid
这样一条记录。根据这个记录集，进行帖子推荐和用户推荐：关注这个帖子的人也关注如下帖子。
    输出结果： 帖子相关集与用户相关集。得出每个帖子最相关的１０个帖子，以及每个用户最相关的１０个用户。

数据实例： 输入数据为２３万条回复记录，包含20310篇帖子，以及３８４４个回复用户。完整数据随后附上。

5423;13593
5423;12301
5423;12201
5423;12201
5423;12597
5423;12348
49;12171
49;12200
49;12171
49;12219
49;12185
49;12235
50;12184
50;12209
50;12219
......

输出实例：
Post: 帖子相关数据
26297:25547 23850 25013 26213 637 26295 22421 25613 22696 26002
25869:24775 8985 24405 25426 637 26046 25930 25280 25693 26002
26277:25930 13016 24059 20517 23837 24440 25640 25914 25966 24613
26302:26277 26302 637 24775 26046 25930 25931 20522 19749 22421
25426:26098 25930 25390 15318 637 26002 810 26030 8985 25547
．．．．．
User: 用户相关数据
12873:12180 12853 12253 13218 12276 13033 12749 13159 12255 13699
14103:14591 13218 12180 13214 16522 13224 15636 14580 13186 12873
14591:12180 15636 16522 13224 14580 14381 13218 13457 12253 14937
．．．．．．

处理程序：

sim.py  (感谢 Yang jie 先生）
def read_data(filename):
    """Load reply records and index them by post and by user.

    Every non-blank line of *filename* must have the form ``tid;uid``:
    two integers separated by one semicolon.  Repeated replies are kept,
    so the same id may occur more than once in a list.

    Returns a ``(post, user)`` tuple of dicts:
    ``post[pid]`` is the ascending list of uids that replied to ``pid``;
    ``user[uid]`` is the ascending list of pids that ``uid`` replied to.

    Raises ValueError for a non-blank line that does not consist of two
    integers separated by a single semicolon.
    """
    post = {}   # post[pid] = [uid_1, uid_2, ..., uid_m]
    user = {}   # user[uid] = [pid_1, pid_2, ..., pid_m]
    # The context manager closes the file even when a line fails to parse.
    with open(filename, 'r') as data:
        # Iterate lazily instead of readlines(): no need to hold the
        # whole file in memory next to the two indexes.
        for line in data:
            if not line.strip():
                continue    # tolerate blank lines (e.g. a trailing one)
            pid, uid = line.split(';')
            pid = int(pid)  # int() ignores the trailing newline
            uid = int(uid)
            post.setdefault(pid, []).append(uid)
            user.setdefault(uid, []).append(pid)
    # cooccurrence() walks these lists as sorted sequences.
    for uid in user:
        user[uid].sort()
    for pid in post:
        post[pid].sort()
    return post, user

def cooccurrence(pref1, pref2):
    """Count the matching elements of two ascending-sorted sequences.

    Both sequences are walked in lock-step with one cursor each, so they
    must already be sorted (read_data() sorts its lists).  Each match
    consumes one element from either side, which means a value repeated
    in both sequences is counted as often as it can be paired up.
    """
    end1, end2 = len(pref1), len(pref2)
    left = right = 0
    shared = 0
    while left != end1 and right != end2:
        a, b = pref1[left], pref2[right]
        if a < b:
            # Smaller element cannot have a partner further right: skip it.
            left += 1
        elif a > b:
            right += 1
        else:
            shared += 1
            left += 1
            right += 1
    return shared

def all_sim(data, top_n=None):
    """Rank, for every key of *data*, the other keys by co-occurrence.

    *data* maps a key to a list that is handed unchanged to
    cooccurrence(), once per unordered pair of keys.

    Returns a dict ``res`` where ``res[key]`` is a list of
    ``(co, other_key)`` tuples sorted in descending order (highest count
    first, ties broken by the larger key first).  A key never appears in
    its own list.

    *top_n*, when given, must be a non-negative int; each list is then
    cut down to its first *top_n* entries.  The default ``None`` keeps
    every entry, which is the original behaviour.

    The index of the key being processed is written to stdout as a
    space-separated progress indicator.
    """
    import sys  # local import: only needed for the progress output

    # Snapshot of the keys; works whether keys() is a list or a view.
    all_key = list(data)
    res = {}
    for k in all_key:
        res[k] = []
    for i, key_i in enumerate(all_key):
        sys.stdout.write('%d ' % i)
        pref_i = data[key_i]
        row_i = res[key_i]
        # Only pairs with j > i: one computation is recorded for both keys.
        for key_j in all_key[i + 1:]:
            co = cooccurrence(pref_i, data[key_j])
            row_i.append((co, key_j))
            res[key_j].append((co, key_i))
    for k in all_key:
        # Keys are unique, so no two tuples compare equal and this gives
        # the same order as sort() followed by reverse().
        res[k].sort(reverse=True)
        if top_n is not None:
            del res[k][top_n:]
    return res

def write_res(res, filename):
    """Write the similarity table *res* to *filename*, one key per line.

    *res* maps an integer key to a list of ``(co, other_key)`` tuples.
    Lines are written in ascending key order as ``"<key>: <list>"``,
    where ``<list>`` is the Python list representation of the other keys
    in the order they appear in ``res[key]``; the counts are not written.
    An existing file is overwritten.
    """
    # The context manager flushes and closes the file even if a write fails.
    with open(filename, 'w') as data:
        for key in sorted(res):
            neighbours = [k for co, k in res[key]]
            data.write('%d: %s\n' % (key, neighbours))


if __name__ == "__main__":
    # Script entry point: load the reply log, then compute and store the
    # similarity tables -- users first, posts second.
    post, user = read_data('pw_posts.csv')
    print("%d %d" % (len(post), len(user)))  # 20310 3844 for the full data set
    jobs = (
        (user, "finding similar users is done", 'sim_user.txt'),
        (post, "finding similar posts is done", 'sim_post.txt'),
    )
    for table, done_message, out_name in jobs:
        similar = all_sim(table)
        print(done_message)
        write_res(similar, out_name)


我把这个程序翻译成 Common Lisp 的，主要是学习之用

(defun split-by-colon (string)
  "Split STRING at every semicolon and return the pieces as a list of fresh strings.
Despite the name, the separator is the semicolon character.  A string
without a separator yields a one-element list; adjacent separators yield
empty strings."
  (let ((fields '())
        (start 0))
    (loop
      (let ((end (position #\; string :start start)))
        ;; END is NIL after the last separator, so SUBSEQ runs to the end.
        (push (subseq string start end) fields)
        (if end
            (setf start (1+ end))
            (return (nreverse fields)))))))

(defun read-src ()
  "Read the file myprog/pw_posts.csv and return one field list per line.
Each line is split with SPLIT-BY-COLON.  Lines are collected with PUSH,
so the result is in reverse file order.  Returns NIL when the file does
not exist."
  ;; WITH-OPEN-FILE closes the stream on normal and on non-local exit.
  ;; With :IF-DOES-NOT-EXIST NIL the variable IN is NIL for a missing file.
  (with-open-file (in "myprog/pw_posts.csv" :if-does-not-exist nil)
    (let ((data '()))
      (when in
        (loop for line = (read-line in nil)
              while line
              do (push (split-by-colon line) data)))
      data)))

(defun get-dic (data)
  "Index the records in DATA by post and by user.
DATA is a list of records whose first element is a post id and whose
second element is a user id.  Returns a two-element list
\(POST-TABLE USER-TABLE): POST-TABLE maps a post id to the list of user
ids recorded for it, USER-TABLE maps a user id to the list of post ids.
Both tables compare keys with EQUAL; repeated records are kept, and each
list is in reverse order of appearance in DATA.  Records that lack
either field are skipped."
  (let ((post (make-hash-table :test 'equal))
        (user (make-hash-table :test 'equal)))
    (dolist (pair data)
      (let ((pid (first pair))
            (uid (second pair)))
        ;; A blank input line splits into a single field; without this
        ;; guard it would create an entry under the key NIL.
        (when (and pid uid)
          ;; GETHASH yields NIL for a missing key, so PUSH covers both
          ;; the first and every later occurrence.
          (push uid (gethash pid post))
          (push pid (gethash uid user)))))
    (list post user)))


;; Global state filled in by LOAD-DATA; all three start out as NIL.
;; *DATA* holds the two-element list returned by GET-DIC.
(defvar *DATA* nil)
;; *post* holds the first element of that list (the post table).
(defvar *post* nil)
;; *user* holds the second element of that list (the user table).
(defvar *user* nil)

(defun load-data ()
  "Read the source file, build both tables and store them in the globals.
Sets *DATA* to the list returned by GET-DIC, *POST* to its first element
and *USER* to its second element, and prints the size of both tables to
standard output.  Always returns T."
  (let ((d (get-dic (read-src))))
    ;; GET-DIC returns (post-table user-table), so the user count comes
    ;; from the SECOND element and the post count from the first.
    (format t "User count: ~A~%Post count: ~A~%"
            (hash-table-count (second d))
            (hash-table-count (car d)))
    (setq *DATA* d)
    (setq *post* (car d))
    (setq *user* (second d)))
  t)


(defun cooccurrence (pref1 pref2)
  "Return how many distinct elements occur in both PREF1 and PREF2.
Elements are compared with EQUAL, so strings match by content.  The
argument lists are not modified."
  ;; REMOVE-DUPLICATES defaults to EQL, under which two separately
  ;; allocated strings never match; duplicates would survive and could be
  ;; counted more than once by INTERSECTION.
  (length (intersection
           (remove-duplicates pref1 :test 'equal)
           (remove-duplicates pref2 :test 'equal)
           :test 'equal)))

(defun sort-it (list)
  "Sort LIST by the second element of each entry, largest first.
Uses SORT, which may destroy LIST; callers must use the returned value."
  (sort list #'> :key #'second))

(defun proc-res-list (list)
  "Sort LIST with SORT-IT, drop the first entry and return up to ten of the rest.
The result is also printed to standard output, preceded by a space and
followed by a newline.  A list with fewer than eleven entries yields
whatever follows the first entry; an empty list yields NIL.
LIST may be destroyed by the sort."
  ;; NOTE(review): the first entry is presumably the key's own record,
  ;; which ALL-SIM includes in the list.  If another entry ties with it,
  ;; the sort does not guarantee that it comes first -- confirm.
  (let* ((sorted (sort-it list))
         (len (length sorted))
         ;; Clamp both bounds: SUBSEQ signals an error when an index
         ;; exceeds the length of the sequence.
         (res (subseq sorted (min 1 len) (min 11 len))))
    (format t " ~A~%" res)
    res))


(defun all-sim (data)
  "Build a table of the entries most similar to each key of DATA.
DATA is a hash table.  For every key, the COOCCURRENCE of its value with
the value of every key in DATA (itself included) is collected as a list
of (other-key count) entries, which PROC-RES-LIST then reduces.  Returns
a new EQUAL hash table mapping each key to that reduced list.  A running
counter and the current key are printed to standard output."
  (let ((res (make-hash-table :test 'equal))
        (i 0))
    (maphash
     (lambda (k v)
       (incf i)
       (format t "~A- ~A : " i k)
       (let ((row '()))
         ;; PUSH keeps the entries in reverse iteration order.
         (maphash (lambda (k2 v2)
                    (push (list k2 (cooccurrence v v2)) row))
                  data)
         (setf (gethash k res) (proc-res-list row))))
     data)
    res))


(defun write-res (data filename)
  "Write the hash table DATA to the file FILENAME, one line per key.
Each line is the key, a colon, then the first element of every entry in
the key's value, each followed by a space.  Keys and those first
elements must be strings.  An existing file is superseded.  Returns T."
  ;; WITH-OPEN-FILE closes the stream even when a write signals an error.
  (with-open-file (out filename :direction :output :if-exists :supersede)
    (loop for k being the hash-keys in data using (hash-value v)
          do (write-string k out)
             (write-string ":" out)
             (dolist (p v)
               (write-string (car p) out)
               (write-string " " out))
             (write-line "" out)))
  t)

(defun calc ()
  "Run the whole pipeline.
Calls LOAD-DATA, then writes the ALL-SIM result for *POST* to
post_res.dat and the result for *USER* to user_res.dat.  Returns the
value of the last WRITE-RES call."
  (load-data)
  (write-res (all-sim *post*) "post_res.dat")
  (write-res (all-sim *user*) "user_res.dat"))


--
Welcome to my blog ( about Python , Lisp)
http://albertlee.cublog.cn/
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://lists.exoweb.net/pipermail/python-chinese/attachments/20060430/1966dba7/attachment-0001.html

标题：[python-chinese] BBS帖子推荐问题总结