最新消息:XAMPP默认安装之后是很不安全的,我们只需要点击左方菜单的“安全”选项,按照向导操作即可完成安全设置。

python对数据集进行清洗与可视化

XAMPP案例 admin 581浏览 0评论

00Python

import os

import easygui as g

import glob

import pandas as pd

import xml.etree.ElementTree as ET

from tqdm import tqdm

import pandas_profiling

# Ask the user for the image folder. `default` is only the directory the
# dialog opens in; point it at your own data to save some clicking.
image_path = g.diropenbox(title="请选择图像文件夹路径", default=r"E:\python\标定数据清洗\00001001-00001500image")
if image_path is None:
    # diropenbox returns None when the dialog is cancelled. os.listdir(None)
    # would silently list the current working directory, so stop here.
    raise SystemExit("未选择图像文件夹,程序退出")
print(image_path)

# Ask the user for the folder holding the XML annotation files.
xml_path = g.diropenbox(title="请选择xml文件夹路径", default=r"E:\python\标定数据清洗\00001001-00001500xml")
if xml_path is None:
    raise SystemExit("未选择xml文件夹,程序退出")
print(xml_path)

# Plain file names (no directory part) found in each folder.
image_lst = os.listdir(image_path)
xml_lst = os.listdir(xml_path)

print("image list:", len(image_lst))
print("xml list:", len(xml_lst))

print("————————功能1:显示命名不规划的xml文件———————————————————")

# Feature 1: report XML files whose names break the naming convention.
# The rule here is "file name is exactly 12 characters long" (for example an
# 8-character stem plus ".xml"); adjust the length to your own convention.
err_xml = [xml_name for xml_name in xml_lst if len(xml_name) != 12]
for xml_name in err_xml:
    print(xml_name)

if not err_xml:
    print("无不规范命名的xml文件")

print("————————功能2:缺失xml文件显示——————————————————————————")

# Feature 2: list images that have no annotation file.
# A set gives O(1) membership tests instead of scanning xml_lst per image.
xml_names = set(xml_lst)

# Stems (file names without extension) of the images lacking an XML file.
missing_xml = []
for image in tqdm(image_lst):
    # splitext copes with extensions of any length (".jpg", ".jpeg", ...),
    # unlike slicing off a fixed four characters.
    stem = os.path.splitext(image)[0]
    if stem + '.xml' not in xml_names:
        missing_xml.append(stem)

print("缺失xml文件数:", len(missing_xml))
print("缺失xml文件为:", missing_xml)

print("————————功能3:缺失图像显示—————————————————————————————")

# Feature 3: list annotation files that have no image (i.e. surplus XMLs).
# Match on the stem only, so an annotation whose image is not a ".jpg"
# (e.g. ".png") is not reported as orphaned.
image_stems = {os.path.splitext(name)[0] for name in image_lst}

# Stems of the XML files for which no image with the same stem exists.
missing_image = []
for xml_name in tqdm(xml_lst):
    stem = os.path.splitext(xml_name)[0]
    if stem not in image_stems:
        missing_image.append(stem)

print("缺失image文件数:", len(missing_image))
print("缺失image文件为:", missing_image)

print("————————功能4:删除没有对应xml的图片—————————————————————")

# Feature 4: delete every image that has no annotation file.
# `missing_xml` holds extension-less stems, so walk the real file names in
# the image folder and delete those whose stem is pending. This avoids
# guessing a ".jpg" extension and avoids removing items from a list while
# iterating over it (which skips elements).
pending_stems = set(missing_xml)
drop_list1 = []
for image in image_lst:
    stem = os.path.splitext(image)[0]
    if stem in pending_stems:
        # os.path.join picks the right separator on every platform.
        os.remove(os.path.join(image_path, image))
        drop_list1.append(stem)

# The pending list has been processed; leave it empty.
missing_xml.clear()

if drop_list1:
    print("成功删除:", drop_list1)
else:
    print("无缺失文件")

print("————————功能5:删除没有对应图片的xml文件——————————————————")

# Feature 5: delete every annotation file that has no image.
drop_list2 = []
# Iterate over a snapshot; removing items from the list being looped over
# would skip elements.
for stem in list(missing_image):
    # os.path.join picks the right separator on every platform.
    os.remove(os.path.join(xml_path, stem + '.xml'))
    drop_list2.append(stem)

# The pending list has been processed; leave it empty.
missing_image.clear()

if drop_list2:
    print("成功删除:", drop_list2)
else:
    print("无缺失文件")

print("————————功能6:将xml文件写入csv文件——————————————————————")

# Feature 6: flatten the XML annotations into a table for later analysis.
def _child(parent, tag, position):
    """Return the child of *parent* named *tag*, else the child at *position*."""
    found = parent.find(tag)
    return found if found is not None else parent[position]


def xml_to_csv(path):
    """Collect every annotated object under *path* into a DataFrame.

    Each ``*.xml`` file directly inside *path* is parsed and every
    ``<object>`` element becomes one row.

    Args:
        path: Directory containing the XML annotation files.

    Returns:
        A ``pandas.DataFrame`` with the columns ``filename``, ``width``,
        ``height``, ``class``, ``xmin``, ``ymin``, ``xmax`` and ``ymax``.
        It is empty (but keeps those columns) when no objects are found.
    """
    xml_list = []
    # os.path.join keeps the pattern valid on every platform; sorting makes
    # the row order reproducible, since glob's order is unspecified.
    for xml_file in sorted(glob.glob(os.path.join(path, '*.xml'))):
        root = ET.parse(xml_file).getroot()
        members = root.findall('object')
        if not members:
            # Nothing to record for this file.
            continue

        filename = root.find('filename').text
        size = root.find('size')
        width = int(_child(size, 'width', 0).text)
        height = int(_child(size, 'height', 1).text)

        for member in members:
            # Look children up by tag name first, so optional tags that are
            # missing or reordered do not shift the values; the positional
            # fallback covers files that only match the fixed layout.
            box = _child(member, 'bndbox', 4)
            xml_list.append((
                filename,
                width,
                height,
                _child(member, 'name', 0).text,
                int(_child(box, 'xmin', 0).text),
                int(_child(box, 'ymin', 1).text),
                int(_child(box, 'xmax', 2).text),
                int(_child(box, 'ymax', 3).text),
            ))

    column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    xml_df = pd.DataFrame(xml_list, columns=column_name)
    return xml_df

# Dump all annotations to labels.csv in the current working directory.
xml_df = xml_to_csv(xml_path)
# index=False keeps the DataFrame's row index out of the CSV.
xml_df.to_csv('labels.csv', index=False)
print('Successfully converted xml to csv.')

print("————————功能7:查看xml文件信息,生成报告———————————————————")

def eda(in_file, out_file):
    """Generate a pandas-profiling report for a CSV file.

    Args:
        in_file: Path of the comma-separated file to analyse.
        out_file: Path the report is written to.
    """
    data = pd.read_csv(in_file, sep=',')
    pfr = pandas_profiling.ProfileReport(data)
    pfr.to_file(out_file)

# Profile the CSV written by feature 6 and save the report as HTML.
in_file = 'labels.csv'
out_file = 'labels.html'
eda(in_file, out_file)
print('eda done!')

print("————————功能8:改写label出错的xml文件————————————————————")

def main(path):
    """Fix known misspelled class labels in the XML files under *path*.

    Every ``<object>`` label is checked: misspellings of ``chemical_vehicle``
    and ``chemical_sign`` are corrected in place, and objects labelled ``w``
    are only recorded. A file is written back only when a label changed.

    Args:
        path: Directory containing the XML annotation files.

    Returns:
        A tuple ``(wrong_class_lst1, wrong_class_lst2, w_lst)`` holding the
        file names with a corrected vehicle label, a corrected sign label,
        and a ``w`` label, one entry per affected object.
    """
    # Misspellings mapped to the vehicle label and the sign label.
    vehicle_typos = ('chemical_vehical', 'chemcial_vehicle', 'chemical_vehicel')
    sign_typos = ('chemical_sigh',)

    wrong_class_lst1, wrong_class_lst2, w_lst = [], [], []
    # os.path.join supplies the separator that plain concatenation omits
    # (without it the pattern never matches files inside the folder).
    for xml_file in sorted(glob.glob(os.path.join(path, '*.xml'))):
        print(xml_file)
        tree = ET.parse(xml_file)
        root = tree.getroot()
        # Fall back to the XML file's own name if <filename> is absent.
        filename = root.findtext('filename', default=os.path.basename(xml_file))

        changed = False
        for member in root.findall('object'):
            # Prefer the <name> tag; fall back to the first child.
            label = member.find('name')
            if label is None:
                label = member[0]
            value = label.text

            if value in vehicle_typos:
                wrong_class_lst1.append(filename)
                label.text = 'chemical_vehicle'
                changed = True
            elif value in sign_typos:
                wrong_class_lst2.append(filename)
                label.text = 'chemical_sign'
                changed = True
            elif value == 'w':
                w_lst.append(filename)

        if changed:
            # Leave untouched files byte-identical on disk.
            tree.write(xml_file)

    print('wrong_class_list1:', wrong_class_lst1)
    print('wrong_class_list2:', wrong_class_lst2)
    print('w_list:', w_lst)
    return wrong_class_lst1, wrong_class_lst2, w_lst

# Correct the known label typos in the selected annotation folder.
main(xml_path)
print("完成!")

 

转载请注明:XAMPP中文组官网 » python对数据集进行清洗与可视化

您必须 登录 才能发表评论!