Python论坛  - 讨论区

标题:[python-chinese] BBS帖子推荐问题总结

2006年04月30日 星期日 15:33

Albert Lee hanzhupeng at gmail.com
Sun Apr 30 15:33:23 HKT 2006

感谢列表内朋友的帮助。

问题背景:一个论坛,每个帖子有一个tid, 每一个用户有一个uid,当用户回复了某帖子,就生成: tid;uid
这样一条记录。根据这个记录集,进行帖子推荐和用户推荐:关注这个帖子的人也关注如下帖子。
    输出结果: 帖子相关集与用户相关集。得出每个帖子最相关的10个帖子,以及每个用户最相关的10个用户。

数据实例: 输入数据为23万条回复记录,包含20310篇帖子,以及3844个回复用户。完整数据随后附上

5423;13593
5423;12301
5423;12201
5423;12201
5423;12597
5423;12348
49;12171
49;12200
49;12171
49;12219
49;12185
49;12235
50;12184
50;12209
50;12219
......

输出实例:
Post: 帖子相关数据
26297:25547 23850 25013 26213 637 26295 22421 25613 22696 26002
25869:24775 8985 24405 25426 637 26046 25930 25280 25693 26002
26277:25930 13016 24059 20517 23837 24440 25640 25914 25966 24613
26302:26277 26302 637 24775 26046 25930 25931 20522 19749 22421
25426:26098 25930 25390 15318 637 26002 810 26030 8985 25547
.....
User: 用户相关数据
12873:12180 12853 12253 13218 12276 13033 12749 13159 12255 13699
14103:14591 13218 12180 13214 16522 13224 15636 14580 13186 12873
14591:12180 15636 16522 13224 14580 14381 13218 13457 12253 14937
......

处理程序:

sim.py  (感谢 Yang jie 先生)
def read_data(filename):
    """Load ``pid;uid`` reply records and index them both ways.

    Each non-blank line of *filename* must hold two integers separated by
    a semicolon: the post id followed by the id of a user who replied to it.

    Returns a ``(post, user)`` tuple of dicts:
        post[pid] = sorted list of the uids that replied to that post
        user[uid] = sorted list of the pids that user replied to
    Repeated records are kept, so an id may appear more than once in a list.

    Raises ValueError if a non-blank line is not of the form ``int;int``.
    """
    post = {}   # post[pid] = [uid_1, uid_2, ..., uid_m]
    user = {}   # user[uid] = [pid_1, pid_2, ..., pid_m]
    # The with-block guarantees the file is closed, and iterating the file
    # object streams it line by line instead of loading it all at once.
    with open(filename, 'r') as data:
        for line in data:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            pid, uid = line.split(';')
            pid = int(pid)
            uid = int(uid)
            post.setdefault(pid, []).append(uid)
            user.setdefault(uid, []).append(pid)
    # cooccurrence() walks two lists in step, so every list must be sorted.
    for pids in user.values():
        pids.sort()
    for uids in post.values():
        uids.sort()
    return post, user

def cooccurrence(pref1, pref2):
    """Count the entries two ascending-sorted lists have in common.

    Both lists are walked in step (a merge-style scan), so they must
    already be sorted; neither is modified.  A value that is repeated in
    both lists is counted once per matched pair, e.g. ``[1, 1]`` against
    ``[1, 1]`` yields 2.
    """
    pos1 = pos2 = 0
    shared = 0
    end1, end2 = len(pref1), len(pref2)
    while pos1 != end1 and pos2 != end2:
        left = pref1[pos1]
        right = pref2[pos2]
        if left < right:
            pos1 += 1
        elif left > right:
            pos2 += 1
        else:
            # Same value on both sides: count it and advance both cursors.
            shared += 1
            pos1 += 1
            pos2 += 1
    return shared

def all_sim(data):
    """Score every key of *data* against every other key.

    *data* maps a key to a sorted list of ids (as built by read_data).
    Each unordered pair of keys is scored once with cooccurrence() and the
    score is recorded for both members of the pair.

    Returns a dict mapping each key to a list of ``(score, other_key)``
    tuples sorted in descending order (highest score first).  A key is
    never paired with itself.

    Progress (the index of the key being processed) is written to stdout
    on a single line, separated by spaces.
    """
    import sys

    # list(data) is indexable on both Python 2 and 3; dict.keys() is a
    # non-indexable view on Python 3.
    all_key = list(data)
    num_line = len(all_key)
    res = {}
    for k in data:
        res[k] = []
    for i in range(num_line):
        # Same visible output as the old ``print i,`` statement.
        sys.stdout.write('%d ' % i)
        # Hoisted out of the inner loop: these do not depend on j.
        key_i = all_key[i]
        pref_i = data[key_i]
        row_i = res[key_i]
        for j in range(i + 1, num_line):
            key_j = all_key[j]
            co = cooccurrence(pref_i, data[key_j])
            row_i.append((co, key_j))
            res[key_j].append((co, key_i))
    for row in res.values():
        # Tuples are unique (keys differ), so this equals sort() + reverse().
        row.sort(reverse=True)
    return res

def write_res(res, filename):
    """Write a similarity table to *filename*, one key per line.

    *res* maps a key to a list of ``(score, other_key)`` tuples, as
    returned by all_sim().  Lines are written in ascending key order and
    have the form ``"<key>: <list of other_key>"``; the scores themselves
    are not written, only the order they imply.
    """
    res_lines = []
    for key in res:
        res_lines.append((key, [k for co, k in res[key]]))
    res_lines.sort()
    # The with-block closes (and therefore flushes) the file even if a
    # write fails part-way through.
    with open(filename, 'w') as data:
        for key, related in res_lines:
            data.write('%d: %s\n' % (key, related))


if __name__ == "__main__":
    # Script entry point: load the reply log, then compute and save the
    # similarity tables -- users first, posts second.
    post, user = read_data('pw_posts.csv')
    print("%d %d" % (len(post), len(user)))  # sample data: 20310, 3844
    for table, done_message, out_name in (
            (user, "finding similar users is done", 'sim_user.txt'),
            (post, "finding similar posts is done", 'sim_post.txt')):
        similar = all_sim(table)
        print(done_message)
        write_res(similar, out_name)



我把这个程序翻译成 Common Lisp 的,主要是学习之用

(defun split-by-colon (string)
  "Split STRING at every semicolon and return the fields as a list of strings.

Despite the name, the separator is the semicolon character, not a colon.
Empty fields are kept, and a string without any separator yields a
one-element list."
  (let ((fields '())
        (start 0))
    (loop
      (let ((end (position #\; string :start start)))
        ;; END is NIL for the last field, which makes SUBSEQ run to the end.
        (push (subseq string start end) fields)
        (if end
            (setf start (1+ end))
            (return (nreverse fields)))))))

(defun read-src ()
  "Read myprog/pw_posts.csv and return its records as a list of field lists.

Every line is split with SPLIT-BY-COLON.  Records are accumulated with
PUSH, so the returned list is in reverse file order.  Returns NIL when the
file does not exist."
  ;; WITH-OPEN-FILE closes the stream even when reading exits non-locally
  ;; (for example on a read error); a manual OPEN/CLOSE pair would leak it.
  ;; With :IF-DOES-NOT-EXIST NIL the stream variable may be NIL, hence WHEN.
  (with-open-file (in "myprog/pw_posts.csv" :if-does-not-exist nil)
    (let ((data '()))
      (when in
        (loop for line = (read-line in nil)
              while line do (push (split-by-colon line) data)))
      data)))

(defun get-dic (data)
  "Index the records in DATA by post id and by user id.

DATA is a list of records whose first element is a post id and whose
second element is a user id.  Returns a two-element list (POST USER) of
EQUAL hash tables: POST maps a post id to the user ids recorded for it,
USER maps a user id to the post ids recorded for it.  Repeated records
are kept, so a value list may contain the same id more than once."
  (let ((post (make-hash-table :test 'equal))
        (user (make-hash-table :test 'equal)))
    (dolist (pair data (list post user))
      (let ((pid (first pair))
            (uid (second pair)))
        ;; First sighting of a post stores the tail of the record itself.
        (cond ((gethash pid post)
               (push uid (gethash pid post)))
              (t
               (setf (gethash pid post) (rest pair))))
        (cond ((gethash uid user)
               (push pid (gethash uid user)))
              (t
               (setf (gethash uid user) (list pid))))))))


;; Global state, all NIL until LOAD-DATA fills it in.
;; *DATA* holds the whole (post user) list returned by GET-DIC.
(defvar *DATA* nil)
;; *post* holds the first element of that list (the post hash table).
(defvar *post* nil)
;; *user* holds the second element of that list (the user hash table).
(defvar *user* nil)

(defun load-data ()
  "Read the source file, index it, and store the result in the globals.

Sets *DATA* to the (post user) list built by GET-DIC from READ-SRC,
*post* to its first element and *user* to its second, and prints the
number of entries in each table.  Always returns T."
  (let ((d (get-dic (read-src))))
    ;; GET-DIC returns (post user), so the user table is the SECOND element;
    ;; passing the tables in that order keeps each label on its own count.
    (format t "User count: ~A~%Post count: ~A~%"
            (hash-table-count (second d))
            (hash-table-count (car d)))
    (setq *DATA* d)
    (setq *post* (car d))
    (setq *user* (second d)))
  t)


(defun cooccurrence (pref1 pref2)
  "Return how many distinct elements PREF1 and PREF2 have in common.

Elements are compared with EQUAL, so ids stored as strings match by
content.  Neither list is modified."
  ;; REMOVE-DUPLICATES defaults to EQL, which never treats two separately
  ;; read strings as the same id; without :TEST EQUAL the duplicates stay
  ;; and INTERSECTION may then report them more than once.
  (length (intersection
           (remove-duplicates pref1 :test 'equal)
           (remove-duplicates pref2 :test 'equal)
           :test 'equal)))

(defun sort-it (list)
  "Sort LIST into descending order of each entry's second element.

The sort is destructive: callers must use the returned list and must not
rely on the argument afterwards."
  (sort list #'> :key #'second))

(defun proc-res-list (list)
  "Sort LIST with SORT-IT, drop the top entry, and keep up to ten that follow.

The kept entries are printed on one line and returned.  LIST is sorted
destructively.  Lists with fewer than eleven entries yield whatever
remains after the first one (possibly nothing) instead of signalling a
bounds error."
  ;; NOTE(review): dropping the first entry presumably removes the key's
  ;; match with itself, which ALL-SIM includes; a tie in score could put a
  ;; different entry first -- confirm with the caller.
  (let* ((sorted (sort-it list))
         (n (length sorted))
         ;; Clamp both bounds so SUBSEQ never runs past a short list.
         (res (subseq sorted (min 1 n) (min 11 n))))
    (format t " ~A~%" res)
    res))


(defun all-sim (data)
  "Score every key of the hash table DATA against every key of DATA.

For each key K, a list of (OTHER-KEY SCORE) entries is built, where SCORE
is COOCCURRENCE of the two value lists -- the entry for K itself is
included -- and is then reduced by PROC-RES-LIST.  Returns a new EQUAL
hash table mapping each key to that reduced list.  A running counter and
the key are printed before each key is processed."
  (let ((res (make-hash-table :test 'equal))
        (i 0))
    (maphash
     (lambda (k v)
       (incf i)
       (format t "~A- ~A : " i k)
       (let ((scores '()))
         (maphash (lambda (k2 v2)
                    (push (list k2 (cooccurrence v v2)) scores))
                  data)
         (setf (gethash k res) (proc-res-list scores))))
     data)
    res))


(defun write-res (data filename)
  "Write the hash table DATA to FILENAME, one key per line.

Each line has the form KEY:ID ID ... where the ids are the first element
of every entry stored under KEY; each id is followed by a single space.
Keys and ids must be strings.  An existing file is superseded.
Returns T."
  ;; WITH-OPEN-FILE closes the stream even if a write signals an error;
  ;; a manual OPEN/CLOSE pair would leave it open in that case.
  (with-open-file (out filename :direction :output :if-exists :supersede)
    (loop for k being the hash-keys in data using (hash-value v)
          do (write-string k out)
             (write-string ":" out)
             (dolist (p v)
               (write-string (car p) out)
               (write-string " " out))
             (terpri out)))
  t)

(defun calc ()
  "Run the whole pipeline: load the data, then write both result files.

The post similarities go to post_res.dat and the user similarities to
user_res.dat, in that order."
  (load-data)
  (write-res (all-sim *post*) "post_res.dat")
  (write-res (all-sim *user*) "user_res.dat"))



--
Welcome to my blog ( about Python , Lisp)
http://albertlee.cublog.cn/
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://lists.exoweb.net/pipermail/python-chinese/attachments/20060430/1966dba7/attachment-0001.html

[导入自Mailman归档:http://www.zeuux.org/pipermail/zeuux-python]

2006年04月30日 星期日 15:48

Zoom.Quiet zoom.quiet at gmail.com
Sun Apr 30 15:48:58 HKT 2006

On 4/30/06, Albert Lee <hanzhupeng at gmail.com> wrote:
>
> 感谢列表内朋友的帮助。
>
> 问题背景:一个论坛,每个帖子有一个tid, 每一个用户有一个uid,当用户回复了某帖子,就生成: tid;uid 这样一条记录。根据这个记录集,进行帖子推荐和用户推荐:关注这个帖子的人也关注如下帖子。
>     输出结果: 帖子相关集与用户相关集。得出每个帖子最相关的10个帖子,以及每个用户最相关的10个帖子。
>
> 数据实例: 输入数据为23万条回复纪录,包含20310篇帖子,以及3844个回复用户。完整数据随后附上
太大的话,直接在啄木鸟维基中组织一下子,
为了方便理解,也注释一下子哪!!!
而且还有你运行环境的说明,和几个程序的运行情况?可以比较一下子算法的威力…………

>
> 5423;13593
> 5423;12301
> 5423;12201
> 5423;12201
> 5423;12597
> 5423;12348
> 49;12171
> 49;12200
> 49;12171
> 49;12219
> 49;12185
> 49;12235
> 50;12184
> 50;12209
> 50;12219
> ......
>
> 输出实例:
>  Post: 帖子相关数据
>
> 26297:25547 23850 25013 26213 637 26295 22421 25613 22696 26002
> 25869:24775 8985 24405 25426 637 26046 25930 25280 25693 26002
> 26277:25930 13016 24059 20517 23837 24440 25640 25914 25966 24613
> 26302:26277 26302 637 24775 26046 25930 25931 20522 19749 22421
> 25426:26098 25930 25390 15318 637 26002 810 26030 8985 25547
> .....
>  User: 用户相关数据
>
> 12873:12180 12853 12253 13218 12276 13033 12749 13159 12255 13699
> 14103:14591 13218 12180 13214 16522 13224 15636 14580 13186 12873
> 14591:12180 15636 16522 13224 14580 14381 13218 13457 12253 14937
> ......
>
> 处理程序:
>
> sim.py  (感谢 Yang jie 先生)
> def read_data(filename):
>     data = open(filename, 'r')
>     post = {}   # post[pid] = [uid_1, uid_2, ..., uid_m]
>     user = {}   # user[uid] = [pid_1, pid_2, ..., pid_m]
>     for line in data.readlines():
>         pid, uid = line.split(';')
>         pid = int(pid)
>         uid = int(uid)
>         if post.has_key(pid):
>              post[pid].append(uid)
>         else:
>             post[pid] = [uid]
>         if user.has_key(uid):
>             user[uid].append(pid)
>         else:
>             user[uid] = [pid]
>     for uid in user:
>         user[uid].sort()
>     for pid in post:
>         post[pid].sort()
>     return post, user
>
> def cooccurrence(pref1, pref2):
>     #pref1.sort()   # already sorted
>     #pref2.sort()
>     i = 0
>      j = 0
>     co = 0
>     size1 = len(pref1)
>     size2 = len(pref2)
>     while 1:
>         if (i== size1) or (j==size2): break
>         if pref1[i] < pref2[j] :
>             i=i+1
>         elif pref1[i]  > pref2[j] :
>             j=j+1
>         else:
>             co +=1
>             i+=1
>             j+=1
>     return co
>
> def all_sim(data):
>     all_key = data.keys()
>     num_line = len(all_key)
>     res = {}
>      for k in data:
>         res[k] = []
>     for i in range(num_line):
>         print i,
>         for j in range(i+1, num_line):
>             key_i = all_key[i]
>             key_j = all_key[j]
>             co = cooccurrence(data[key_i], data[key_j])
>             res[key_i].append((co, key_j))
>             res[key_j].append((co, key_i))
>     for i in res:
>         res[i].sort()
>         res[i].reverse()
>     return res
>
> def write_res(res, filename):
>     data = open(filename, 'w')
>     res_lines = []
>     for i in res:
>         line = res[i]
>         res_lines.append((i, [k for co, k in line]))
>     res_lines.sort()
>     for i in res_lines:
>          data.write('%d: %s\n'%(i[0], i[1]))
>
>
> if __name__ == "__main__":
>     post, user = read_data('pw_posts.csv')
>     print len(post), len(user)  #20310, 3844
>     res2 = all_sim(user)
>     print "finding similar users is done"
>     write_res(res2, 'sim_user.txt')
>     res1 = all_sim(post)
>     print "finding similar posts is done"
>     write_res(res1, 'sim_post.txt')
>
>
>
> 我把这个程序翻译成 Common Lisp 的,主要是学习之用
>
> (defun split-by-colon (string)
>     (loop for i = 0 then (1+ j)
>           as j = (position #\; string :start i)
>           collect (subseq string i j)
>           while j))
>
> (defun read-src ()
>   (let ((in (open "myprog/pw_posts.csv" :if-does-not-exist nil))
>     (data '()))
>     (when in
>       (loop for line = (read-line in nil)
>         while line do (push (split-by-colon line) data))
>       (close in))
>     data))
>
> (defun get-dic (data)
>   (let ((post (make-hash-table :test 'equal))
>     (user (make-hash-table :test 'equal)))
>     (dolist (pair data)
>       (progn
>     (if (gethash (car pair) post)
>         (push (second pair) (gethash (car pair) post))
>         (setf (gethash (car pair) post) (cdr pair))))
>     (if (gethash (second pair) user)
>         (push (car pair) (gethash (second pair) user))
>         (setf (gethash (second pair) user) (list (car pair)))))
>     (list post user)))
>
>
> (defvar *DATA* nil)
> (defvar *post* nil)
> (defvar *user* nil)
>
> (defun load-data ()
>   (let ((d (get-dic (read-src))))
>     (format t "User count: ~A~%Post count: ~A~%" (hash-table-count (car d))
>         (hash-table-count (second d)))
>     (setq *DATA* d)
>     (setq *post* (car d))
>     (setq *user* (second d)))
>   t)
>
>
> (defun cooccurrence (pref1 pref2)
>   (length (intersection
>        (remove-duplicates pref1)
>        (remove-duplicates pref2) :test 'equal)))
>
> (defun sort-it (list)
>   (sort list #'(lambda (x y)
>          (> (second x)(second y)))))
>
> (defun proc-res-list (list)
>   (let ((res (subseq (sort-it list) 1 11)))
>     (format t " ~A~%" res)
>     res))
>
>
> (defun all-sim (data)
>   (let ((res (make-hash-table :test 'equal))
>     (i 0))
>     (loop for k being the hash-keys in data using (hash-value v)
>       do (progn
>            (setq i (+ i 1))
>            (format t "~A- ~A : " i k)
>            (setf (gethash k res) '())
>            (loop for k2 being the hash-keys in data using (hash-value v2)
>              do (push (list k2 (cooccurrence v v2)) (gethash k res)))
>            (setf (gethash k res) (proc-res-list (gethash k res)))))
>     res))
>
>
> (defun write-res (data filename)
>   (let ((out (open filename :direction :output :if-exists :supersede)))
>     (when out
>       (loop for k being the hash-key in data using (hash-value v)
>         do (progn
>          (write-string k out)
>          (write-string ":" out)
>          (dolist (p v) (progn
>                  (write-string (car p) out)
>                  (write-string " " out)))
>          (write-line "" out)))
>       (close out))))
>
> (defun calc ()
>   (progn
>     (load-data)
>     (write-res (all-sim *post*) "post_res.dat")
>     (write-res (all-sim *user*) "user_res.dat")))
>
>
>
>
> --
> Welcome to my blog ( about Python , Lisp)
> http://albertlee.cublog.cn/
> _______________________________________________
> python-chinese
> Post: send python-chinese at lists.python.cn
> Subscribe: send subscribe to python-chinese-request at lists.python.cn
> Unsubscribe: send unsubscribe to  python-chinese-request at lists.python.cn
> Detail Info: http://python.cn/mailman/listinfo/python-chinese
>
>



--
"""Time is unimportant, only life important!
blogging  :  http://blog.zoomquiet.org/pyblosxom/
wiki enter:   http://wiki.woodpecker.org.cn/moin/ZoomQuiet
in douban:  http://www.douban.com/people/zoomq/
"""

[导入自Mailman归档:http://www.zeuux.org/pipermail/zeuux-python]

2006年04月30日 星期日 16:00

Albert Lee hanzhupeng at gmail.com
Sun Apr 30 16:00:13 HKT 2006

Skipped content of type multipart/alternative-------------- next part --------------
A non-text attachment was scrubbed...
Name: pw_posts.csv
Type: application/octet-stream
Size: 2696092 bytes
Desc: not available
Url : http://lists.exoweb.net/pipermail/python-chinese/attachments/20060430/23fb5e12/pw_posts-0001.obj

[导入自Mailman归档:http://www.zeuux.org/pipermail/zeuux-python]

如下红色区域有误,请重新填写。

    你的回复:

    请 登录 后回复。还没有在Zeuux哲思注册吗?现在 注册 !

    Zeuux © 2025

    京ICP备05028076号