今天我们学习 Day 8:collections 模块妙用。这个模块提供了许多强大的数据结构,能极大提升代码的效率和可读性。
深度解析:collections 模块
1.defaultdict:带默认值的字典
传统字典的问题:
# 需要手动检查键是否存在
word_count = {}
words = ["apple", "banana", "apple", "cherry", "banana", "apple"]
for word in words:
    if word in word_count:
        word_count[word] += 1
    else:
        word_count[word] = 1
print(word_count) # 输出: {'apple': 3, 'banana': 2, 'cherry': 1}
使用 defaultdict:
from collections import defaultdict
word_count = defaultdict(int) # 默认值为 0
words = ["apple", "banana", "apple", "cherry", "banana", "apple"]
for word in words:
    word_count[word] += 1 # 不需要检查键是否存在
print(dict(word_count)) # 输出: {'apple': 3, 'banana': 2, 'cherry': 1}
自定义默认值工厂:
# 默认值为空列表
list_dict = defaultdict(list)
list_dict['fruits'].append('apple')
list_dict['fruits'].append('banana')
print(list_dict) # 输出: defaultdict(<class 'list'>, {'fruits': ['apple', 'banana']})
# 默认值为空集合
set_dict = defaultdict(set)
set_dict['colors'].add('red')
set_dict['colors'].add('blue')
set_dict['colors'].add('red') # 重复元素不会被添加
print(set_dict) # 输出: defaultdict(<class 'set'>, {'colors': {'red', 'blue'}}) (集合无序,元素显示顺序可能不同)
2.Counter:计数器
基础计数:
from collections import Counter
words = ["apple", "banana", "apple", "cherry", "banana", "apple"]
word_count = Counter(words)
print(word_count) # 输出: Counter({'apple': 3, 'banana': 2, 'cherry': 1})
print(word_count['apple']) # 输出: 3
print(word_count['orange']) # 输出: 0 (不会报错)
常用操作:
c = Counter(['a', 'b', 'c', 'a', 'b', 'a'])
print(c.most_common(2)) # 输出: [('a', 3), ('b', 2)]
print(sum(c.values())) # 输出: 6 (总计数)
# 更新计数器
c.update(['a', 'b', 'd'])
print(c) # 输出: Counter({'a': 4, 'b': 3, 'c': 1, 'd': 1})
# 数学运算
c1 = Counter(a=3, b=1)
c2 = Counter(a=1, b=2)
print(c1 + c2) # 输出: Counter({'a': 4, 'b': 3})
print(c1 - c2) # 输出: Counter({'a': 2})
print(c1 & c2) # 输出: Counter({'a': 1, 'b': 1}) (交集: min(c1[x], c2[x]))
print(c1 | c2) # 输出: Counter({'a': 3, 'b': 2}) (并集: max(c1[x], c2[x]))
3.namedtuple:命名元组
创建命名元组:
from collections import namedtuple
# 定义 Point 结构
Point = namedtuple('Point', ['x', 'y'])
p = Point(10, 20)
print(p.x, p.y) # 输出: 10 20
print(p[0], p[1]) # 输出: 10 20 (依旧支持索引访问)
实用特性:
# 多种定义字段的方式
Person = namedtuple('Person', 'name age city') # 字符串用空格分隔
Person = namedtuple('Person', ['name', 'age', 'city']) # 列表
Person = namedtuple('Person', 'name, age, city') # 字符串用逗号分隔
# 创建实例
person = Person('Alice', 25, 'New York')
print(f"{person.name}, {person.age}岁, 来自{person.city}")
# _asdict() 转换为字典
print(person._asdict()) # 输出: {'name': 'Alice', 'age': 25, 'city': 'New York'}
# _replace() 创建新实例(不可变对象的"修改")
new_person = person._replace(age=26, city='Boston')
print(new_person) # 输出: Person(name='Alice', age=26, city='Boston')
4.deque:双端队列
from collections import deque
# 创建双端队列
d = deque([1, 2, 3])
# 两端操作
d.append(4) # 右端添加: deque([1, 2, 3, 4])
d.appendleft(0) # 左端添加: deque([0, 1, 2, 3, 4])
d.pop() # 右端移除: 4
d.popleft() # 左端移除: 0
print(d) # 输出: deque([1, 2, 3])
# 其他有用方法
d.extend([4, 5]) # 右端扩展: deque([1, 2, 3, 4, 5])
d.extendleft([0, -1]) # 左端扩展: deque([-1, 0, 1, 2, 3, 4, 5])
d.rotate(2) # 向右旋转: deque([4, 5, -1, 0, 1, 2, 3])
5.OrderedDict:有序字典(Python 3.7+ 中普通dict已有序)
from collections import OrderedDict
# 保持插入顺序
od = OrderedDict()
od['z'] = 1
od['a'] = 2
od['c'] = 3
print(list(od.keys())) # 输出: ['z', 'a', 'c']
# 移动元素到末尾
od.move_to_end('a')
print(list(od.keys())) # 输出: ['z', 'c', 'a']
6. 实际应用场景
场景1:数据分组(defaultdict)
from collections import defaultdict
# 按首字母分组单词
words = ["apple", "banana", "cherry", "date", "elderberry", "fig"]
grouped = defaultdict(list)
for word in words:
    grouped[word[0]].append(word)
print(dict(grouped))
# 输出: {'a': ['apple'], 'b': ['banana'], 'c': ['cherry'], 'd': ['date'], 'e': ['elderberry'], 'f': ['fig']}
场景2:词频分析(Counter)
from collections import Counter
import re
text = "apple banana apple cherry banana apple date apple elderberry"
words = re.findall(r'\w+', text.lower())
word_freq = Counter(words)
print("最常见单词:", word_freq.most_common(3))
# 输出: 最常见单词: [('apple', 4), ('banana', 2), ('cherry', 1)]
# 计算字符频率
char_freq = Counter(text.replace(' ', ''))
print("最常见字符:", char_freq.most_common(5))
场景3:数据处理(namedtuple)
from collections import namedtuple, defaultdict
# 定义数据格式
Employee = namedtuple('Employee', ['id', 'name', 'department', 'salary'])
employees = [
    Employee(1, 'Alice', 'IT', 80000),
    Employee(2, 'Bob', 'HR', 65000),
    Employee(3, 'Charlie', 'IT', 90000)
]
# 计算部门平均薪资
dept_salaries = defaultdict(list)
for emp in employees:
    dept_salaries[emp.department].append(emp.salary)
for dept, salaries in dept_salaries.items():
    avg = sum(salaries) / len(salaries)
    print(f"{dept}部门平均薪资: ${avg:,.2f}")
7. 高级技巧
技巧1:嵌套 defaultdict
from collections import defaultdict
# 创建嵌套的defaultdict
nested = defaultdict(lambda: defaultdict(int))
data = [('A', 'X', 1), ('A', 'Y', 2), ('B', 'X', 3), ('A', 'X', 4)]
for category, subcategory, value in data:
    nested[category][subcategory] += value
print(dict(nested['A'])) # 输出: {'X': 5, 'Y': 2}
技巧2:Counter 的高级统计
from collections import Counter
# 统计每个元素的出现次数
data = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]
counter = Counter(data)
# 找出所有出现次数大于1的元素
common = {item: count for item, count in counter.items() if count > 1}
print(common) # 输出: {2: 2, 3: 3, 4: 4}
# 计算频率百分比
total = sum(counter.values())
percentages = {item: count/total * 100 for item, count in counter.items()}
print(percentages)
今日练习
练习1:使用 defaultdict
# 统计句子中每个单词的长度分布
sentence = "the quick brown fox jumps over the lazy dog"
# 创建 {单词长度: [单词列表]} 的映射
# 在这里编写你的代码
练习2:使用 Counter
# 分析文本,找出最常用的3个单词和最常用的3个字符
text = "Python is awesome and Python is powerful"
# 在这里编写你的代码
练习3:使用 namedtuple
# 创建学生成绩记录系统
# 定义Student结构: name, math_score, english_score, science_score
# 计算每个学生的平均分和总分
# 在这里编写你的代码
练习答案:
# 练习1答案:
from collections import defaultdict
sentence = "the quick brown fox jumps over the lazy dog"
length_map = defaultdict(list)
for word in sentence.split():
    length_map[len(word)].append(word)
print(dict(length_map))
# 练习2答案:
from collections import Counter
text = "Python is awesome and Python is powerful"
words = text.lower().split()
chars = list(text.replace(' ', '').lower())
word_counter = Counter(words)
char_counter = Counter(chars)
print("常用单词:", word_counter.most_common(3))
print("常用字符:", char_counter.most_common(3))
# 练习3答案:
from collections import namedtuple
Student = namedtuple('Student', ['name', 'math', 'english', 'science'])
students = [
    Student('Alice', 85, 92, 78),
    Student('Bob', 76, 88, 90),
    Student('Charlie', 92, 95, 89)
]
for student in students:
    total = student.math + student.english + student.science
    average = total / 3
    print(f"{student.name}: 总分{total}, 平均分{average:.1f}")
今日总结
- defaultdict: 处理缺失键的字典,避免KeyError
- Counter: 高效的计数工具,支持各种统计操作
- namedtuple: 创建可读性强的数据结构
- deque: 高效的双端队列操作
- OrderedDict: 保持插入顺序的字典
选择指南:
- 需要计数 → Counter
- 需要处理缺失键 → defaultdict
- 需要命名字段 → namedtuple
- 需要高效队列操作 → deque
这些工具能让你的代码更简洁、高效和Pythonic!明天我们将学习上下文管理器与 with 语句。
© 版权声明
文章版权归作者所有,未经允许请勿转载。如内容涉嫌侵权,请在本页底部进入<联系我们>进行举报投诉!
THE END
















- 最新
- 最热
只看作者