Complex comparison of two strings

Mateen · October 13, 2020, 9:05am

drop table [employee]
go
CREATE TABLE [dbo].[employee](
	[serno] [int] NOT NULL,
	[empname] [varchar](100) NULL,
	[gl_name] [varchar](100) NULL,
 CONSTRAINT [PK_employee] PRIMARY KEY CLUSTERED 
(
	[serno] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, 
 ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY]

go

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(1,'MUHAMMAD ARMAN','MUHAMMAD ARMAN Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(2,'MUHAMMAD KHURAM','MUHAMMAD KHURAM Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])VALUES
(3,'USAMA AHMED','USAMA AHMED Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(6,'MEHTISHAM MOHIUDDIN','MEHTISHAM MOHIUDDIN Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(7,'HUMA KHAN','HUMA KHAN Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(8,'MUHAMMAD AKASH KHAN','MUHAMMAD AKASH KHAN Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(9,'ROBINA','Rubina Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(10,'HINA NAZ','Hina Naz A-33')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(11,'YASIR HUSSIAN','yaseer Hussain 177616')


INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(14,'MUHAMMAD NOMAN SIDDIQI','MUHAMMAD NOMAN SIDDIQI Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(15,'FEEROZ ANSARI','FEEROZ ANSARI Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(16,'SHAHZAIB AHMED','SHAHZAIB AHMED Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(17,'ABU HURIRAH','ABU HURIRAH Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(18,'MUHAMMAD AHSAN','MUHAMMAD AHSAN Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(19,'ABDUL KAREEM','ABDUL KAREEM A33)')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(20,'IMRAN','IMRAN Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(21,'MUHAMMAD TALHA FARAZ','MUHAMMAD TALHA FARAZ Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(22,'WALEED AHMED KHOKHAR','WALEED AHMED KHOKHAR Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(23,'HYDER ALI SOLANGI','Hyder Ali Solangi Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(24,'ABDUL AZIZ KHAN','ABDUL AZIZ KHAN A33)')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(29,'MEHMOOD UL HAQ','MEHMOOD UL HAQ Dcity')
INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(31,'SHEIKH PERVEZ','Sheikh Pervez')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(32,'AKBER ALI','Akber Ali 175652')

go

drop table common_words

go

create table common_words (words varchar(10))
go

insert into common_words select 'MUHAMMAD'
insert into common_words select 'AHMED'
insert into common_words select 'KHAN'
insert into common_words select 'ALI'
insert into common_words select 'HUSSAIN'
insert into common_words select 'ANSARI'
insert into common_words select 'DCITY'
insert into common_words select '('
insert into common_words select ')'
insert into common_words select '-'

Question:
collate case insensitive

to Compare columns emp_name, gl_name

before comparing ignore common words

if a single word in emp_name is found in gl_name
then it is ok
otherwise it is not ok and show me the not ok rows

result will be
serno empname gl_name
9 ROBINA Rubina Dcity
11 YASIR HUSSIAN yaseer Hussain 177616

mike01 · October 13, 2020, 1:27pm

are you looking for something like this??

Select top 100  * 
from #employee
	where gl_name not like '%' + empName + '%'

harishgg1 · October 13, 2020, 2:04pm

String split .. space

Row number to identify which row

Then check matches string split

Where no matches using row number get original table rows

Mateen · October 13, 2020, 3:51pm

No.
let's consider empid 19, where the two strings are

'ABDUL AZIZ KHAN','ABDUL AZIZ KHAN A33'
after ignoring common word 'khan', the strings will be
'ABDUL AZIZ,'ABDUL AZIZ A33'
the first string has two words
ABDUL, AZIZ
if any one of the above word found in second string
(in this case both found)
the it is ok , don't show this row

now take employee 9
ROBINA , Rubina Dcity
after ignoring common word dcity, it becomes
ROBINA , Rubina
only one word in first string not found in second, it is not ok
show this row

suppose the two strings would have been
ROBINA abc xyz , abc 123 pqr Dcity
after ignoring common word dcity the string becomes
ROBINA abc xyz , abc 123 pqr
words in first string are
ROBINA
abc
xyz
and out of those 3 words one word abc is found
in the second string, it is ok , don't show the row.

I hope , I am clear

Mateen · October 13, 2020, 3:57pm

@mike01

something like this
update  #employee
set empname= rplace(empname, [common word from table],'')
, gl_name=replace(glname,[common word from table],'')

select *
from  #employee
where any word of empname not in gl_name

Mateen · October 13, 2020, 4:02pm

Mateen:

something like this
update  #employee
set empname= rplace(empname, [common word from table],'')
, gl_name=replace(glname,[common word from table],'')

select *
from  #employee
where any word of empname not in gl_name

@harishgg1
yes.
definite use of string_split required

yosiasz · October 13, 2020, 4:59pm

maybe?

;with src
as
(
	select distinct [serno], empname, 
replace(empname, words, '') as clean_empname,  
			gl_name, replace(gl_name, words, '') clean_gl_name
		from employee e
		cross apply common_words cw
),
emp as (
	select distinct a.[serno], e.Item, e.ItemNumber, empname
	  from src a  
	  cross apply DelimitedSplit8K(a.clean_empname, ' ') e
	  where e.Item <> ''
),
gl as (
	select distinct a.[serno], g.Item, g.ItemNumber, gl_name
	  from src a  
	  cross apply DelimitedSplit8K(a.clean_gl_name, ' ') g
	  where g.Item <> ''
)
select distinct emp.serno, empname, gl_name
  from emp
  join gl on emp.serno = gl.serno
  where ( not exists(select 1 from emp where emp.Item = gl.Item)
	 or not exists(select 1 from gl where gl.Item = emp.Item)
	 )

yosiasz · October 13, 2020, 5:02pm

Close but no cigar. the above does not work? I am not sure, maybe it does. tested it and only SHEIKH PERVEZ is excluded.

Mateen · October 13, 2020, 6:07pm

Mateen:

drop table [employee]
go
CREATE TABLE [dbo].[employee](
	[serno] [int] NOT NULL,
	[empname] [varchar](100) NULL,
	[gl_name] [varchar](100) NULL,
 CONSTRAINT [PK_employee] PRIMARY KEY CLUSTERED 
(
	[serno] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, 
 ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY]

go

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(1,'MUHAMMAD ARMAN','MUHAMMAD ARMAN Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(2,'MUHAMMAD KHURAM','MUHAMMAD KHURAM Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])VALUES
(3,'USAMA AHMED','USAMA AHMED Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(6,'MEHTISHAM MOHIUDDIN','MEHTISHAM MOHIUDDIN Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(7,'HUMA KHAN','HUMA KHAN Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(8,'MUHAMMAD AKASH KHAN','MUHAMMAD AKASH KHAN Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(9,'ROBINA','Rubina Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(10,'HINA NAZ','Hina Naz A-33')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(11,'YASIR HUSSIAN','yaseer Hussain 177616')


INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(14,'MUHAMMAD NOMAN SIDDIQI','MUHAMMAD NOMAN SIDDIQI Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(15,'FEEROZ ANSARI','FEEROZ ANSARI Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(16,'SHAHZAIB AHMED','SHAHZAIB AHMED Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(17,'ABU HURIRAH','ABU HURIRAH Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(18,'MUHAMMAD AHSAN','MUHAMMAD AHSAN Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(19,'ABDUL KAREEM','ABDUL KAREEM A33)')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(20,'IMRAN','IMRAN Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(21,'MUHAMMAD TALHA FARAZ','MUHAMMAD TALHA FARAZ Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(22,'WALEED AHMED KHOKHAR','WALEED AHMED KHOKHAR Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(23,'HYDER ALI SOLANGI','Hyder Ali Solangi Dcity')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(24,'ABDUL AZIZ KHAN','ABDUL AZIZ KHAN A33)')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(29,'MEHMOOD UL HAQ','MEHMOOD UL HAQ Dcity')
INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(31,'SHEIKH PERVEZ','Sheikh Pervez')

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(32,'AKBER ALI','Akber Ali 175652')

go

drop table common_words

go

create table common_words (words varchar(10))
go

insert into common_words select 'MUHAMMAD'
insert into common_words select 'AHMED'
insert into common_words select 'KHAN'
insert into common_words select 'ALI'
insert into common_words select 'HUSSAIN'
insert into common_words select 'ANSARI'
insert into common_words select 'DCITY'
insert into common_words select '('
insert into common_words select ')'
insert into common_words select '-'

Please wait
I have got idea and working on it

Thanks

Mateen · October 13, 2020, 6:39pm

@yosiasz

seems that it is ok.
will check in actual data with 530 rows, tomorrow , when in office

taking idea from your code


;with emp as
(
select e.serno,f.item
from employee e
cross apply DelimitedSplit8K(e.empname, ' ') f
where 1=1
and f.Item not in( select * from common_words)
)
, gl as
(
select e.serno,f.item
from employee e
cross apply DelimitedSplit8K(e.gl_name, ' ') f
where 1=1
and f.Item not in( select * from common_words)
),
not_found as
(
select emp.serno
,found= max(case when emp.item = gl.item then 1 else 0 end)
from emp 
join gl on (emp.serno=gl.serno)
group by emp.serno
having max(case when emp.item = gl.item then 1 else 0 end)=0
)
select e.*
from not_found nf
join employee e on nf.serno=e.serno

result

serno	empname	gl_name
9	ROBINA	Rubina Dcity
11	YASIR HUSSIAN	yaseer Hussain 177616

mike01 · October 13, 2020, 6:47pm

;with cte_emp as 
	(
		select e.*,  NewEmpName.Value as  NewEmpName  
		from #employee e
			cross apply string_split(empName, ' ') as NewEmpName
			left join #common_words c
				on NewEmpName.value = c.words
		where c.words is null),
	cte_gl as (
		select e.*, Newgl_name.value as Newgl_name
		from #employee e
			cross apply string_split(gl_name, ' ') as Newgl_name
			left join #common_words c
				on Newgl_name.value = c.words
		where c.words is null)

select * from cte_emp e
	where NewEmpName not in (select Newgl_Name from cte_gl g
								where g.serno = e.serno)

yosiasz · October 13, 2020, 7:00pm

select distinct empname, gl_name from cte_emp e
	where NewEmpName not in (select Newgl_Name from cte_gl g
								where g.serno = e.serno)

Mateen · October 13, 2020, 7:01pm

@mike01

Unfortunately I don't have sql server 2016

after simulating your code to lower version


;with cte_emp as 
	(
		select e.*,  NewEmpName.item as  NewEmpName  
		from employee e
			cross apply DelimitedSplit8K(empName, ' ') as NewEmpName
			left join common_words c
				on NewEmpName.item = c.words
		where c.words is null),
	cte_gl as (
		select e.*, Newgl_name.item as Newgl_name
		from employee e
			cross apply DelimitedSplit8K(gl_name, ' ') as Newgl_name
			left join common_words c
				on Newgl_name.item = c.words
		where c.words is null)
/* correctly suggested by @yosiasz */
select distinct empname, gl_name from cte_emp e
	where NewEmpName not in (select Newgl_Name from cte_gl g
								where g.serno = e.serno)

result

empname	gl_name
ROBINA	Rubina Dcity
YASIR HUSSIAN	yaseer Hussain 177616

and it's correct

Thanks

Mateen · October 13, 2020, 7:08pm

I edited my above post with your suggestion

Thanks

Mateen · October 14, 2020, 4:42am

@mike01

Your query is fast ,

if i add a ok row (ok since FEROZ found in both)

INSERT INTO [employee] ([serno],[empname],[gl_name])
VALUES(100,'FEROZ AKHTER','Feroz Akhtar A-33 (Staff)')

it is not excluding the above row in the result.

mike01 · October 14, 2020, 1:10pm

I understand now. Use the same CTE, but try this

select e.*
  from cte_emp e
	left join (
				select distinct  e.serno 
				  from cte_emp e
					join cte_gl g
						on g.serno = e.serno
						and g.Newgl_name = e.NewEmpName) g
		on g.serno = e.serno
where g.serno is null

Mateen · October 14, 2020, 3:53pm

Yes @mike01

it worked

;with cte_emp as 
	(
		select e.*,  NewEmpName.value as  NewEmpName  
		from employee e
			cross apply string_split(empName, ' ') as NewEmpName
			left join common_words c
				on NewEmpName.value = c.words
		where c.words is null),
	cte_gl as (
		select e.*, Newgl_name.value as Newgl_name
		from employee e
			cross apply string_split(gl_name, ' ') as Newgl_name
			left join common_words c
				on Newgl_name.value = c.words
		where c.words is null)
select distinct e.serno, e.empname, e.gl_name
  from cte_emp e
	left join (
				select distinct  e.serno 
				  from cte_emp e
					join cte_gl g
						on g.serno = e.serno
						and g.Newgl_name = e.NewEmpName) g
		on g.serno = e.serno
where g.serno is null

serno	empname	gl_name
9	ROBINA	Rubina Dcity
11	YASIR HUSSIAN	yaseer Hussain 177616

Thanks and regards.

Mateen · October 14, 2020, 4:01pm

Hi experts

I have got the initial breakthrough and thinking to make it even better
by creating one more table for synonym
here, in my native language
somebody writes 'Akhtar' and some 'Akhter'
by creating a synonym it will become same and
excluded from unmatched rows

Thank you all experts.